1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 /* SPDX-License-Identifier: GPL-2.0-only */ /* L2TP internal definitions. * * Copyright (c) 2008,2009 Katalix Systems Ltd */ #include <linux/refcount.h> #ifndef _L2TP_CORE_H_ #define _L2TP_CORE_H_ #include <net/dst.h> #include <net/sock.h> #ifdef CONFIG_XFRM #include <net/xfrm.h> #endif /* Random numbers used for internal consistency checks of tunnel and session structures */ #define L2TP_TUNNEL_MAGIC 0x42114DDA #define L2TP_SESSION_MAGIC 0x0C04EB7D /* Per tunnel session hash table size */ #define L2TP_HASH_BITS 4 #define L2TP_HASH_SIZE BIT(L2TP_HASH_BITS) /* System-wide session hash table size */ #define L2TP_HASH_BITS_2 8 #define L2TP_HASH_SIZE_2 BIT(L2TP_HASH_BITS_2) struct sk_buff; struct l2tp_stats { atomic_long_t tx_packets; atomic_long_t tx_bytes; atomic_long_t tx_errors; atomic_long_t rx_packets; atomic_long_t rx_bytes; atomic_long_t rx_seq_discards; atomic_long_t rx_oos_packets; atomic_long_t rx_errors; atomic_long_t rx_cookie_discards; atomic_long_t rx_invalid; }; struct l2tp_tunnel; /* L2TP session configuration */ struct l2tp_session_cfg { enum l2tp_pwtype pw_type; unsigned int recv_seq:1; /* expect receive packets with sequence numbers? */ unsigned int send_seq:1; /* send packets with sequence numbers? */ unsigned int lns_mode:1; /* behave as LNS? * LAC enables sequence numbers under LNS control. */ u16 l2specific_type; /* Layer 2 specific type */ u8 cookie[8]; /* optional cookie */ int cookie_len; /* 0, 4 or 8 bytes */ u8 peer_cookie[8]; /* peer's cookie */ int peer_cookie_len; /* 0, 4 or 8 bytes */ int reorder_timeout; /* configured reorder timeout (in jiffies) */ char *ifname; }; /* Represents a session (pseudowire) instance. * Tracks runtime state including cookies, dataplane packet sequencing, and IO statistics. * Is linked into a per-tunnel session hashlist; and in the case of an L2TPv3 session into * an additional per-net ("global") hashlist. */ #define L2TP_SESSION_NAME_MAX 32 struct l2tp_session { int magic; /* should be L2TP_SESSION_MAGIC */ long dead; struct l2tp_tunnel *tunnel; /* back pointer to tunnel context */ u32 session_id; u32 peer_session_id; u8 cookie[8]; int cookie_len; u8 peer_cookie[8]; int peer_cookie_len; u16 l2specific_type; u16 hdr_len; u32 nr; /* session NR state (receive) */ u32 ns; /* session NR state (send) */ struct sk_buff_head reorder_q; /* receive reorder queue */ u32 nr_max; /* max NR. Depends on tunnel */ u32 nr_window_size; /* NR window size */ u32 nr_oos; /* NR of last OOS packet */ int nr_oos_count; /* for OOS recovery */ int nr_oos_count_max; struct hlist_node hlist; /* hash list node */ refcount_t ref_count; char name[L2TP_SESSION_NAME_MAX]; /* for logging */ char ifname[IFNAMSIZ]; unsigned int recv_seq:1; /* expect receive packets with sequence numbers? */ unsigned int send_seq:1; /* send packets with sequence numbers? */ unsigned int lns_mode:1; /* behave as LNS? * LAC enables sequence numbers under LNS control. */ int reorder_timeout; /* configured reorder timeout (in jiffies) */ int reorder_skip; /* set if skip to next nr */ enum l2tp_pwtype pwtype; struct l2tp_stats stats; struct hlist_node global_hlist; /* global hash list node */ /* Session receive handler for data packets. * Each pseudowire implementation should implement this callback in order to * handle incoming packets. Packets are passed to the pseudowire handler after * reordering, if data sequence numbers are enabled for the session. */ void (*recv_skb)(struct l2tp_session *session, struct sk_buff *skb, int data_len); /* Session close handler. * Each pseudowire implementation may implement this callback in order to carry * out pseudowire-specific shutdown actions. * The callback is called by core after unhashing the session and purging its * reorder queue. */ void (*session_close)(struct l2tp_session *session); /* Session show handler. * Pseudowire-specific implementation of debugfs session rendering. * The callback is called by l2tp_debugfs.c after rendering core session * information. */ void (*show)(struct seq_file *m, void *priv); u8 priv[]; /* private data */ }; /* L2TP tunnel configuration */ struct l2tp_tunnel_cfg { enum l2tp_encap_type encap; /* Used only for kernel-created sockets */ struct in_addr local_ip; struct in_addr peer_ip; #if IS_ENABLED(CONFIG_IPV6) struct in6_addr *local_ip6; struct in6_addr *peer_ip6; #endif u16 local_udp_port; u16 peer_udp_port; unsigned int use_udp_checksums:1, udp6_zero_tx_checksums:1, udp6_zero_rx_checksums:1; }; /* Represents a tunnel instance. * Tracks runtime state including IO statistics. * Holds the tunnel socket (either passed from userspace or directly created by the kernel). * Maintains a hashlist of sessions belonging to the tunnel instance. * Is linked into a per-net list of tunnels. */ #define L2TP_TUNNEL_NAME_MAX 20 struct l2tp_tunnel { int magic; /* Should be L2TP_TUNNEL_MAGIC */ unsigned long dead; struct rcu_head rcu; spinlock_t hlist_lock; /* write-protection for session_hlist */ bool acpt_newsess; /* indicates whether this tunnel accepts * new sessions. Protected by hlist_lock. */ struct hlist_head session_hlist[L2TP_HASH_SIZE]; /* hashed list of sessions, hashed by id */ u32 tunnel_id; u32 peer_tunnel_id; int version; /* 2=>L2TPv2, 3=>L2TPv3 */ char name[L2TP_TUNNEL_NAME_MAX]; /* for logging */ enum l2tp_encap_type encap; struct l2tp_stats stats; struct list_head list; /* list node on per-namespace list of tunnels */ struct net *l2tp_net; /* the net we belong to */ refcount_t ref_count; void (*old_sk_destruct)(struct sock *sk); struct sock *sock; /* parent socket */ int fd; /* parent fd, if tunnel socket was created * by userspace */ struct work_struct del_work; }; /* Pseudowire ops callbacks for use with the l2tp genetlink interface */ struct l2tp_nl_cmd_ops { /* The pseudowire session create callback is responsible for creating a session * instance for a specific pseudowire type. * It must call l2tp_session_create and l2tp_session_register to register the * session instance, as well as carry out any pseudowire-specific initialisation. * It must return >= 0 on success, or an appropriate negative errno value on failure. */ int (*session_create)(struct net *net, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg); /* The pseudowire session delete callback is responsible for initiating the deletion * of a session instance. * It must call l2tp_session_delete, as well as carry out any pseudowire-specific * teardown actions. */ void (*session_delete)(struct l2tp_session *session); }; static inline void *l2tp_session_priv(struct l2tp_session *session) { return &session->priv[0]; } /* Tunnel and session refcounts */ void l2tp_tunnel_inc_refcount(struct l2tp_tunnel *tunnel); void l2tp_tunnel_dec_refcount(struct l2tp_tunnel *tunnel); void l2tp_session_inc_refcount(struct l2tp_session *session); void l2tp_session_dec_refcount(struct l2tp_session *session); /* Tunnel and session lookup. * These functions take a reference on the instances they return, so * the caller must ensure that the reference is dropped appropriately. */ struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id); struct l2tp_tunnel *l2tp_tunnel_get_nth(const struct net *net, int nth); struct l2tp_session *l2tp_tunnel_get_session(struct l2tp_tunnel *tunnel, u32 session_id); struct l2tp_session *l2tp_session_get(const struct net *net, u32 session_id); struct l2tp_session *l2tp_session_get_nth(struct l2tp_tunnel *tunnel, int nth); struct l2tp_session *l2tp_session_get_by_ifname(const struct net *net, const char *ifname); /* Tunnel and session lifetime management. * Creation of a new instance is a two-step process: create, then register. * Destruction is triggered using the *_delete functions, and completes asynchronously. */ int l2tp_tunnel_create(int fd, int version, u32 tunnel_id, u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg, struct l2tp_tunnel **tunnelp); int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, struct l2tp_tunnel_cfg *cfg); void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel); struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg); int l2tp_session_register(struct l2tp_session *session, struct l2tp_tunnel *tunnel); void l2tp_session_delete(struct l2tp_session *session); /* Receive path helpers. If data sequencing is enabled for the session these * functions handle queuing and reordering prior to passing packets to the * pseudowire code to be passed to userspace. */ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, unsigned char *ptr, unsigned char *optr, u16 hdrflags, int length); int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb); /* Transmit path helpers for sending packets over the tunnel socket. */ void l2tp_session_set_header_len(struct l2tp_session *session, int version); int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb); /* Pseudowire management. * Pseudowires should register with l2tp core on module init, and unregister * on module exit. */ int l2tp_nl_register_ops(enum l2tp_pwtype pw_type, const struct l2tp_nl_cmd_ops *ops); void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type); /* IOCTL helper for IP encap modules. */ int l2tp_ioctl(struct sock *sk, int cmd, int *karg); /* Extract the tunnel structure from a socket's sk_user_data pointer, * validating the tunnel magic feather. */ struct l2tp_tunnel *l2tp_sk_to_tunnel(struct sock *sk); static inline int l2tp_get_l2specific_len(struct l2tp_session *session) { switch (session->l2specific_type) { case L2TP_L2SPECTYPE_DEFAULT: return 4; case L2TP_L2SPECTYPE_NONE: default: return 0; } } static inline u32 l2tp_tunnel_dst_mtu(const struct l2tp_tunnel *tunnel) { struct dst_entry *dst; u32 mtu; dst = sk_dst_get(tunnel->sock); if (!dst) return 0; mtu = dst_mtu(dst); dst_release(dst); return mtu; } #ifdef CONFIG_XFRM static inline bool l2tp_tunnel_uses_xfrm(const struct l2tp_tunnel *tunnel) { struct sock *sk = tunnel->sock; return sk && (rcu_access_pointer(sk->sk_policy[0]) || rcu_access_pointer(sk->sk_policy[1])); } #else static inline bool l2tp_tunnel_uses_xfrm(const struct l2tp_tunnel *tunnel) { return false; } #endif static inline int l2tp_v3_ensure_opt_in_linear(struct l2tp_session *session, struct sk_buff *skb, unsigned char **ptr, unsigned char **optr) { int opt_len = session->peer_cookie_len + l2tp_get_l2specific_len(session); if (opt_len > 0) { int off = *ptr - *optr; if (!pskb_may_pull(skb, off + opt_len)) return -1; if (skb->data != *optr) { *optr = skb->data; *ptr = skb->data + off; } } return 0; } #define MODULE_ALIAS_L2TP_PWTYPE(type) \ MODULE_ALIAS("net-l2tp-type-" __stringify(type)) #endif /* _L2TP_CORE_H_ */
1 3 3 3 2 2 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) Neil Brown 2002 * Copyright (C) Christoph Hellwig 2007 * * This file contains the code mapping from inodes to NFS file handles, * and for mapping back from file handles to dentries. * * For details on why we do all the strange and hairy things in here * take a look at Documentation/filesystems/nfs/exporting.rst. */ #include <linux/exportfs.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/module.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/sched.h> #include <linux/cred.h> #define dprintk(fmt, args...) pr_debug(fmt, ##args) static int get_name(const struct path *path, char *name, struct dentry *child); static int exportfs_get_name(struct vfsmount *mnt, struct dentry *dir, char *name, struct dentry *child) { const struct export_operations *nop = dir->d_sb->s_export_op; struct path path = {.mnt = mnt, .dentry = dir}; if (nop->get_name) return nop->get_name(dir, name, child); else return get_name(&path, name, child); } /* * Check if the dentry or any of it's aliases is acceptable. */ static struct dentry * find_acceptable_alias(struct dentry *result, int (*acceptable)(void *context, struct dentry *dentry), void *context) { struct dentry *dentry, *toput = NULL; struct inode *inode; if (acceptable(context, result)) return result; inode = result->d_inode; spin_lock(&inode->i_lock); hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { dget(dentry); spin_unlock(&inode->i_lock); if (toput) dput(toput); if (dentry != result && acceptable(context, dentry)) { dput(result); return dentry; } spin_lock(&inode->i_lock); toput = dentry; } spin_unlock(&inode->i_lock); if (toput) dput(toput); return NULL; } static bool dentry_connected(struct dentry *dentry) { dget(dentry); while (dentry->d_flags & DCACHE_DISCONNECTED) { struct dentry *parent = dget_parent(dentry); dput(dentry); if (dentry == parent) { dput(parent); return false; } dentry = parent; } dput(dentry); return true; } static void clear_disconnected(struct dentry *dentry) { dget(dentry); while (dentry->d_flags & DCACHE_DISCONNECTED) { struct dentry *parent = dget_parent(dentry); WARN_ON_ONCE(IS_ROOT(dentry)); spin_lock(&dentry->d_lock); dentry->d_flags &= ~DCACHE_DISCONNECTED; spin_unlock(&dentry->d_lock); dput(dentry); dentry = parent; } dput(dentry); } /* * Reconnect a directory dentry with its parent. * * This can return a dentry, or NULL, or an error. * * In the first case the returned dentry is the parent of the given * dentry, and may itself need to be reconnected to its parent. * * In the NULL case, a concurrent VFS operation has either renamed or * removed this directory. The concurrent operation has reconnected our * dentry, so we no longer need to. */ static struct dentry *reconnect_one(struct vfsmount *mnt, struct dentry *dentry, char *nbuf) { struct dentry *parent; struct dentry *tmp; int err; parent = ERR_PTR(-EACCES); inode_lock(dentry->d_inode); if (mnt->mnt_sb->s_export_op->get_parent) parent = mnt->mnt_sb->s_export_op->get_parent(dentry); inode_unlock(dentry->d_inode); if (IS_ERR(parent)) { dprintk("get_parent of %lu failed, err %ld\n", dentry->d_inode->i_ino, PTR_ERR(parent)); return parent; } dprintk("%s: find name of %lu in %lu\n", __func__, dentry->d_inode->i_ino, parent->d_inode->i_ino); err = exportfs_get_name(mnt, parent, nbuf, dentry); if (err == -ENOENT) goto out_reconnected; if (err) goto out_err; dprintk("%s: found name: %s\n", __func__, nbuf); tmp = lookup_one_unlocked(mnt_idmap(mnt), nbuf, parent, strlen(nbuf)); if (IS_ERR(tmp)) { dprintk("lookup failed: %ld\n", PTR_ERR(tmp)); err = PTR_ERR(tmp); goto out_err; } if (tmp != dentry) { /* * Somebody has renamed it since exportfs_get_name(); * great, since it could've only been renamed if it * got looked up and thus connected, and it would * remain connected afterwards. We are done. */ dput(tmp); goto out_reconnected; } dput(tmp); if (IS_ROOT(dentry)) { err = -ESTALE; goto out_err; } return parent; out_err: dput(parent); return ERR_PTR(err); out_reconnected: dput(parent); /* * Someone must have renamed our entry into another parent, in * which case it has been reconnected by the rename. * * Or someone removed it entirely, in which case filehandle * lookup will succeed but the directory is now IS_DEAD and * subsequent operations on it will fail. * * Alternatively, maybe there was no race at all, and the * filesystem is just corrupt and gave us a parent that doesn't * actually contain any entry pointing to this inode. So, * double check that this worked and return -ESTALE if not: */ if (!dentry_connected(dentry)) return ERR_PTR(-ESTALE); return NULL; } /* * Make sure target_dir is fully connected to the dentry tree. * * On successful return, DCACHE_DISCONNECTED will be cleared on * target_dir, and target_dir->d_parent->...->d_parent will reach the * root of the filesystem. * * Whenever DCACHE_DISCONNECTED is unset, target_dir is fully connected. * But the converse is not true: target_dir may have DCACHE_DISCONNECTED * set but already be connected. In that case we'll verify the * connection to root and then clear the flag. * * Note that target_dir could be removed by a concurrent operation. In * that case reconnect_path may still succeed with target_dir fully * connected, but further operations using the filehandle will fail when * necessary (due to S_DEAD being set on the directory). */ static int reconnect_path(struct vfsmount *mnt, struct dentry *target_dir, char *nbuf) { struct dentry *dentry, *parent; dentry = dget(target_dir); while (dentry->d_flags & DCACHE_DISCONNECTED) { BUG_ON(dentry == mnt->mnt_sb->s_root); if (IS_ROOT(dentry)) parent = reconnect_one(mnt, dentry, nbuf); else parent = dget_parent(dentry); if (!parent) break; dput(dentry); if (IS_ERR(parent)) return PTR_ERR(parent); dentry = parent; } dput(dentry); clear_disconnected(target_dir); return 0; } struct getdents_callback { struct dir_context ctx; char *name; /* name that was found. It already points to a buffer NAME_MAX+1 is size */ u64 ino; /* the inum we are looking for */ int found; /* inode matched? */ int sequence; /* sequence counter */ }; /* * A rather strange filldir function to capture * the name matching the specified inode number. */ static bool filldir_one(struct dir_context *ctx, const char *name, int len, loff_t pos, u64 ino, unsigned int d_type) { struct getdents_callback *buf = container_of(ctx, struct getdents_callback, ctx); buf->sequence++; if (buf->ino == ino && len <= NAME_MAX && !is_dot_dotdot(name, len)) { memcpy(buf->name, name, len); buf->name[len] = '\0'; buf->found = 1; return false; // no more } return true; } /** * get_name - default export_operations->get_name function * @path: the directory in which to find a name * @name: a pointer to a %NAME_MAX+1 char buffer to store the name * @child: the dentry for the child directory. * * calls readdir on the parent until it finds an entry with * the same inode number as the child, and returns that. */ static int get_name(const struct path *path, char *name, struct dentry *child) { const struct cred *cred = current_cred(); struct inode *dir = path->dentry->d_inode; int error; struct file *file; struct kstat stat; struct path child_path = { .mnt = path->mnt, .dentry = child, }; struct getdents_callback buffer = { .ctx.actor = filldir_one, .name = name, }; error = -ENOTDIR; if (!dir || !S_ISDIR(dir->i_mode)) goto out; error = -EINVAL; if (!dir->i_fop) goto out; /* * inode->i_ino is unsigned long, kstat->ino is u64, so the * former would be insufficient on 32-bit hosts when the * filesystem supports 64-bit inode numbers. So we need to * actually call ->getattr, not just read i_ino: */ error = vfs_getattr_nosec(&child_path, &stat, STATX_INO, AT_STATX_SYNC_AS_STAT); if (error) return error; buffer.ino = stat.ino; /* * Open the directory ... */ file = dentry_open(path, O_RDONLY, cred); error = PTR_ERR(file); if (IS_ERR(file)) goto out; error = -EINVAL; if (!file->f_op->iterate_shared) goto out_close; buffer.sequence = 0; while (1) { int old_seq = buffer.sequence; error = iterate_dir(file, &buffer.ctx); if (buffer.found) { error = 0; break; } if (error < 0) break; error = -ENOENT; if (old_seq == buffer.sequence) break; } out_close: fput(file); out: return error; } #define FILEID_INO64_GEN_LEN 3 /** * exportfs_encode_ino64_fid - encode non-decodeable 64bit ino file id * @inode: the object to encode * @fid: where to store the file handle fragment * @max_len: maximum length to store there (in 4 byte units) * * This generic function is used to encode a non-decodeable file id for * fanotify for filesystems that do not support NFS export. */ static int exportfs_encode_ino64_fid(struct inode *inode, struct fid *fid, int *max_len) { if (*max_len < FILEID_INO64_GEN_LEN) { *max_len = FILEID_INO64_GEN_LEN; return FILEID_INVALID; } fid->i64.ino = inode->i_ino; fid->i64.gen = inode->i_generation; *max_len = FILEID_INO64_GEN_LEN; return FILEID_INO64_GEN; } /** * exportfs_encode_inode_fh - encode a file handle from inode * @inode: the object to encode * @fid: where to store the file handle fragment * @max_len: maximum length to store there * @parent: parent directory inode, if wanted * @flags: properties of the requested file handle * * Returns an enum fid_type or a negative errno. */ int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid, int *max_len, struct inode *parent, int flags) { const struct export_operations *nop = inode->i_sb->s_export_op; if (!exportfs_can_encode_fh(nop, flags)) return -EOPNOTSUPP; if (!nop && (flags & EXPORT_FH_FID)) return exportfs_encode_ino64_fid(inode, fid, max_len); return nop->encode_fh(inode, fid->raw, max_len, parent); } EXPORT_SYMBOL_GPL(exportfs_encode_inode_fh); /** * exportfs_encode_fh - encode a file handle from dentry * @dentry: the object to encode * @fid: where to store the file handle fragment * @max_len: maximum length to store there * @flags: properties of the requested file handle * * Returns an enum fid_type or a negative errno. */ int exportfs_encode_fh(struct dentry *dentry, struct fid *fid, int *max_len, int flags) { int error; struct dentry *p = NULL; struct inode *inode = dentry->d_inode, *parent = NULL; if ((flags & EXPORT_FH_CONNECTABLE) && !S_ISDIR(inode->i_mode)) { p = dget_parent(dentry); /* * note that while p might've ceased to be our parent already, * it's still pinned by and still positive. */ parent = p->d_inode; } error = exportfs_encode_inode_fh(inode, fid, max_len, parent, flags); dput(p); return error; } EXPORT_SYMBOL_GPL(exportfs_encode_fh); struct dentry * exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len, int fileid_type, int (*acceptable)(void *, struct dentry *), void *context) { const struct export_operations *nop = mnt->mnt_sb->s_export_op; struct dentry *result, *alias; char nbuf[NAME_MAX+1]; int err; /* * Try to get any dentry for the given file handle from the filesystem. */ if (!exportfs_can_decode_fh(nop)) return ERR_PTR(-ESTALE); result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type); if (IS_ERR_OR_NULL(result)) return result; /* * If no acceptance criteria was specified by caller, a disconnected * dentry is also accepatable. Callers may use this mode to query if * file handle is stale or to get a reference to an inode without * risking the high overhead caused by directory reconnect. */ if (!acceptable) return result; if (d_is_dir(result)) { /* * This request is for a directory. * * On the positive side there is only one dentry for each * directory inode. On the negative side this implies that we * to ensure our dentry is connected all the way up to the * filesystem root. */ if (result->d_flags & DCACHE_DISCONNECTED) { err = reconnect_path(mnt, result, nbuf); if (err) goto err_result; } if (!acceptable(context, result)) { err = -EACCES; goto err_result; } return result; } else { /* * It's not a directory. Life is a little more complicated. */ struct dentry *target_dir, *nresult; /* * See if either the dentry we just got from the filesystem * or any alias for it is acceptable. This is always true * if this filesystem is exported without the subtreecheck * option. If the filesystem is exported with the subtree * check option there's a fair chance we need to look at * the parent directory in the file handle and make sure * it's connected to the filesystem root. */ alias = find_acceptable_alias(result, acceptable, context); if (alias) return alias; /* * Try to extract a dentry for the parent directory from the * file handle. If this fails we'll have to give up. */ err = -ESTALE; if (!nop->fh_to_parent) goto err_result; target_dir = nop->fh_to_parent(mnt->mnt_sb, fid, fh_len, fileid_type); if (!target_dir) goto err_result; err = PTR_ERR(target_dir); if (IS_ERR(target_dir)) goto err_result; /* * And as usual we need to make sure the parent directory is * connected to the filesystem root. The VFS really doesn't * like disconnected directories.. */ err = reconnect_path(mnt, target_dir, nbuf); if (err) { dput(target_dir); goto err_result; } /* * Now that we've got both a well-connected parent and a * dentry for the inode we're after, make sure that our * inode is actually connected to the parent. */ err = exportfs_get_name(mnt, target_dir, nbuf, result); if (err) { dput(target_dir); goto err_result; } inode_lock(target_dir->d_inode); nresult = lookup_one(mnt_idmap(mnt), nbuf, target_dir, strlen(nbuf)); if (!IS_ERR(nresult)) { if (unlikely(nresult->d_inode != result->d_inode)) { dput(nresult); nresult = ERR_PTR(-ESTALE); } } inode_unlock(target_dir->d_inode); /* * At this point we are done with the parent, but it's pinned * by the child dentry anyway. */ dput(target_dir); if (IS_ERR(nresult)) { err = PTR_ERR(nresult); goto err_result; } dput(result); result = nresult; /* * And finally make sure the dentry is actually acceptable * to NFSD. */ alias = find_acceptable_alias(result, acceptable, context); if (!alias) { err = -EACCES; goto err_result; } return alias; } err_result: dput(result); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(exportfs_decode_fh_raw); struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, int fh_len, int fileid_type, int (*acceptable)(void *, struct dentry *), void *context) { struct dentry *ret; ret = exportfs_decode_fh_raw(mnt, fid, fh_len, fileid_type, acceptable, context); if (IS_ERR_OR_NULL(ret)) { if (ret == ERR_PTR(-ENOMEM)) return ret; return ERR_PTR(-ESTALE); } return ret; } EXPORT_SYMBOL_GPL(exportfs_decode_fh); MODULE_LICENSE("GPL");
2688 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef __ASM_COMPAT_H #define __ASM_COMPAT_H #define COMPAT_UTS_MACHINE "riscv\0\0" /* * Architecture specific compatibility types */ #include <linux/types.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> #include <asm-generic/compat.h> static inline int is_compat_task(void) { if (!IS_ENABLED(CONFIG_COMPAT)) return 0; return test_thread_flag(TIF_32BIT); } static inline int is_compat_thread(struct thread_info *thread) { if (!IS_ENABLED(CONFIG_COMPAT)) return 0; return test_ti_thread_flag(thread, TIF_32BIT); } static inline void set_compat_task(bool is_compat) { if (is_compat) set_thread_flag(TIF_32BIT); else clear_thread_flag(TIF_32BIT); } struct compat_user_regs_struct { compat_ulong_t pc; compat_ulong_t ra; compat_ulong_t sp; compat_ulong_t gp; compat_ulong_t tp; compat_ulong_t t0; compat_ulong_t t1; compat_ulong_t t2; compat_ulong_t s0; compat_ulong_t s1; compat_ulong_t a0; compat_ulong_t a1; compat_ulong_t a2; compat_ulong_t a3; compat_ulong_t a4; compat_ulong_t a5; compat_ulong_t a6; compat_ulong_t a7; compat_ulong_t s2; compat_ulong_t s3; compat_ulong_t s4; compat_ulong_t s5; compat_ulong_t s6; compat_ulong_t s7; compat_ulong_t s8; compat_ulong_t s9; compat_ulong_t s10; compat_ulong_t s11; compat_ulong_t t3; compat_ulong_t t4; compat_ulong_t t5; compat_ulong_t t6; }; static inline void regs_to_cregs(struct compat_user_regs_struct *cregs, struct pt_regs *regs) { cregs->pc = (compat_ulong_t) regs->epc; cregs->ra = (compat_ulong_t) regs->ra; cregs->sp = (compat_ulong_t) regs->sp; cregs->gp = (compat_ulong_t) regs->gp; cregs->tp = (compat_ulong_t) regs->tp; cregs->t0 = (compat_ulong_t) regs->t0; cregs->t1 = (compat_ulong_t) regs->t1; cregs->t2 = (compat_ulong_t) regs->t2; cregs->s0 = (compat_ulong_t) regs->s0; cregs->s1 = (compat_ulong_t) regs->s1; cregs->a0 = (compat_ulong_t) regs->a0; cregs->a1 = (compat_ulong_t) regs->a1; cregs->a2 = (compat_ulong_t) regs->a2; cregs->a3 = (compat_ulong_t) regs->a3; cregs->a4 = (compat_ulong_t) regs->a4; cregs->a5 = (compat_ulong_t) regs->a5; cregs->a6 = (compat_ulong_t) regs->a6; cregs->a7 = (compat_ulong_t) regs->a7; cregs->s2 = (compat_ulong_t) regs->s2; cregs->s3 = (compat_ulong_t) regs->s3; cregs->s4 = (compat_ulong_t) regs->s4; cregs->s5 = (compat_ulong_t) regs->s5; cregs->s6 = (compat_ulong_t) regs->s6; cregs->s7 = (compat_ulong_t) regs->s7; cregs->s8 = (compat_ulong_t) regs->s8; cregs->s9 = (compat_ulong_t) regs->s9; cregs->s10 = (compat_ulong_t) regs->s10; cregs->s11 = (compat_ulong_t) regs->s11; cregs->t3 = (compat_ulong_t) regs->t3; cregs->t4 = (compat_ulong_t) regs->t4; cregs->t5 = (compat_ulong_t) regs->t5; cregs->t6 = (compat_ulong_t) regs->t6; }; static inline void cregs_to_regs(struct compat_user_regs_struct *cregs, struct pt_regs *regs) { regs->epc = (unsigned long) cregs->pc; regs->ra = (unsigned long) cregs->ra; regs->sp = (unsigned long) cregs->sp; regs->gp = (unsigned long) cregs->gp; regs->tp = (unsigned long) cregs->tp; regs->t0 = (unsigned long) cregs->t0; regs->t1 = (unsigned long) cregs->t1; regs->t2 = (unsigned long) cregs->t2; regs->s0 = (unsigned long) cregs->s0; regs->s1 = (unsigned long) cregs->s1; regs->a0 = (unsigned long) cregs->a0; regs->a1 = (unsigned long) cregs->a1; regs->a2 = (unsigned long) cregs->a2; regs->a3 = (unsigned long) cregs->a3; regs->a4 = (unsigned long) cregs->a4; regs->a5 = (unsigned long) cregs->a5; regs->a6 = (unsigned long) cregs->a6; regs->a7 = (unsigned long) cregs->a7; regs->s2 = (unsigned long) cregs->s2; regs->s3 = (unsigned long) cregs->s3; regs->s4 = (unsigned long) cregs->s4; regs->s5 = (unsigned long) cregs->s5; regs->s6 = (unsigned long) cregs->s6; regs->s7 = (unsigned long) cregs->s7; regs->s8 = (unsigned long) cregs->s8; regs->s9 = (unsigned long) cregs->s9; regs->s10 = (unsigned long) cregs->s10; regs->s11 = (unsigned long) cregs->s11; regs->t3 = (unsigned long) cregs->t3; regs->t4 = (unsigned long) cregs->t4; regs->t5 = (unsigned long) cregs->t5; regs->t6 = (unsigned long) cregs->t6; }; #endif /* __ASM_COMPAT_H */
4 4 4 2 3 2 3 3 1 3 3 1 3 5 2 3 1 2 3 3 3 3 3 3 4 4 4 4 3 4 4 4 4 4 3 1 3 3 3 5 2 5 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 // SPDX-License-Identifier: GPL-2.0 /* * Supplementary group IDs */ #include <linux/cred.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/security.h> #include <linux/sort.h> #include <linux/syscalls.h> #include <linux/user_namespace.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> struct group_info *groups_alloc(int gidsetsize) { struct group_info *gi; gi = kvmalloc(struct_size(gi, gid, gidsetsize), GFP_KERNEL_ACCOUNT); if (!gi) return NULL; refcount_set(&gi->usage, 1); gi->ngroups = gidsetsize; return gi; } EXPORT_SYMBOL(groups_alloc); void groups_free(struct group_info *group_info) { kvfree(group_info); } EXPORT_SYMBOL(groups_free); /* export the group_info to a user-space array */ static int groups_to_user(gid_t __user *grouplist, const struct group_info *group_info) { struct user_namespace *user_ns = current_user_ns(); int i; unsigned int count = group_info->ngroups; for (i = 0; i < count; i++) { gid_t gid; gid = from_kgid_munged(user_ns, group_info->gid[i]); if (put_user(gid, grouplist+i)) return -EFAULT; } return 0; } /* fill a group_info from a user-space array - it must be allocated already */ static int groups_from_user(struct group_info *group_info, gid_t __user *grouplist) { struct user_namespace *user_ns = current_user_ns(); int i; unsigned int count = group_info->ngroups; for (i = 0; i < count; i++) { gid_t gid; kgid_t kgid; if (get_user(gid, grouplist+i)) return -EFAULT; kgid = make_kgid(user_ns, gid); if (!gid_valid(kgid)) return -EINVAL; group_info->gid[i] = kgid; } return 0; } static int gid_cmp(const void *_a, const void *_b) { kgid_t a = *(kgid_t *)_a; kgid_t b = *(kgid_t *)_b; return gid_gt(a, b) - gid_lt(a, b); } void groups_sort(struct group_info *group_info) { sort(group_info->gid, group_info->ngroups, sizeof(*group_info->gid), gid_cmp, NULL); } EXPORT_SYMBOL(groups_sort); /* a simple bsearch */ int groups_search(const struct group_info *group_info, kgid_t grp) { unsigned int left, right; if (!group_info) return 0; left = 0; right = group_info->ngroups; while (left < right) { unsigned int mid = (left+right)/2; if (gid_gt(grp, group_info->gid[mid])) left = mid + 1; else if (gid_lt(grp, group_info->gid[mid])) right = mid; else return 1; } return 0; } /** * set_groups - Change a group subscription in a set of credentials * @new: The newly prepared set of credentials to alter * @group_info: The group list to install */ void set_groups(struct cred *new, struct group_info *group_info) { put_group_info(new->group_info); get_group_info(group_info); new->group_info = group_info; } EXPORT_SYMBOL(set_groups); /** * set_current_groups - Change current's group subscription * @group_info: The group list to impose * * Validate a group subscription and, if valid, impose it upon current's task * security record. */ int set_current_groups(struct group_info *group_info) { struct cred *new; const struct cred *old; int retval; new = prepare_creds(); if (!new) return -ENOMEM; old = current_cred(); set_groups(new, group_info); retval = security_task_fix_setgroups(new, old); if (retval < 0) goto error; return commit_creds(new); error: abort_creds(new); return retval; } EXPORT_SYMBOL(set_current_groups); SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist) { const struct cred *cred = current_cred(); int i; if (gidsetsize < 0) return -EINVAL; /* no need to grab task_lock here; it cannot change */ i = cred->group_info->ngroups; if (gidsetsize) { if (i > gidsetsize) { i = -EINVAL; goto out; } if (groups_to_user(grouplist, cred->group_info)) { i = -EFAULT; goto out; } } out: return i; } bool may_setgroups(void) { struct user_namespace *user_ns = current_user_ns(); return ns_capable_setid(user_ns, CAP_SETGID) && userns_may_setgroups(user_ns); } /* * SMP: Our groups are copy-on-write. We can set them safely * without another task interfering. */ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) { struct group_info *group_info; int retval; if (!may_setgroups()) return -EPERM; if ((unsigned)gidsetsize > NGROUPS_MAX) return -EINVAL; group_info = groups_alloc(gidsetsize); if (!group_info) return -ENOMEM; retval = groups_from_user(group_info, grouplist); if (retval) { put_group_info(group_info); return retval; } groups_sort(group_info); retval = set_current_groups(group_info); put_group_info(group_info); return retval; } /* * Check whether we're fsgid/egid or in the supplemental group.. */ int in_group_p(kgid_t grp) { const struct cred *cred = current_cred(); int retval = 1; if (!gid_eq(grp, cred->fsgid)) retval = groups_search(cred->group_info, grp); return retval; } EXPORT_SYMBOL(in_group_p); int in_egroup_p(kgid_t grp) { const struct cred *cred = current_cred(); int retval = 1; if (!gid_eq(grp, cred->egid)) retval = groups_search(cred->group_info, grp); return retval; } EXPORT_SYMBOL(in_egroup_p);
8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM icmp #if !defined(_TRACE_ICMP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_ICMP_H #include <linux/icmp.h> #include <linux/tracepoint.h> TRACE_EVENT(icmp_send, TP_PROTO(const struct sk_buff *skb, int type, int code), TP_ARGS(skb, type, code), TP_STRUCT__entry( __field(const void *, skbaddr) __field(int, type) __field(int, code) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __field(__u16, sport) __field(__u16, dport) __field(unsigned short, ulen) ), TP_fast_assign( struct iphdr *iph = ip_hdr(skb); struct udphdr *uh = udp_hdr(skb); int proto_4 = iph->protocol; __be32 *p32; __entry->skbaddr = skb; __entry->type = type; __entry->code = code; if (proto_4 != IPPROTO_UDP || (u8 *)uh < skb->head || (u8 *)uh + sizeof(struct udphdr) > skb_tail_pointer(skb)) { __entry->sport = 0; __entry->dport = 0; __entry->ulen = 0; } else { __entry->sport = ntohs(uh->source); __entry->dport = ntohs(uh->dest); __entry->ulen = ntohs(uh->len); } p32 = (__be32 *) __entry->saddr; *p32 = iph->saddr; p32 = (__be32 *) __entry->daddr; *p32 = iph->daddr; ), TP_printk("icmp_send: type=%d, code=%d. From %pI4:%u to %pI4:%u ulen=%d skbaddr=%p", __entry->type, __entry->code, __entry->saddr, __entry->sport, __entry->daddr, __entry->dport, __entry->ulen, __entry->skbaddr) ); #endif /* _TRACE_ICMP_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
2 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NF_QUEUE_H #define _NF_QUEUE_H #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/jhash.h> #include <linux/netfilter.h> #include <linux/skbuff.h> /* Each queued (to userspace) skbuff has one of these. */ struct nf_queue_entry { struct list_head list; struct sk_buff *skb; unsigned int id; unsigned int hook_index; /* index in hook_entries->hook[] */ #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) struct net_device *physin; struct net_device *physout; #endif struct nf_hook_state state; u16 size; /* sizeof(entry) + saved route keys */ /* extra space to store route keys */ }; #define nf_queue_entry_reroute(x) ((void *)x + sizeof(struct nf_queue_entry)) /* Packet queuing */ struct nf_queue_handler { int (*outfn)(struct nf_queue_entry *entry, unsigned int queuenum); void (*nf_hook_drop)(struct net *net); }; void nf_register_queue_handler(const struct nf_queue_handler *qh); void nf_unregister_queue_handler(void); bool nf_queue_entry_get_refs(struct nf_queue_entry *entry); void nf_queue_entry_free(struct nf_queue_entry *entry); static inline void init_hashrandom(u32 *jhash_initval) { while (*jhash_initval == 0) *jhash_initval = get_random_u32(); } static inline u32 hash_v4(const struct iphdr *iph, u32 initval) { /* packets in either direction go into same queue */ if ((__force u32)iph->saddr < (__force u32)iph->daddr) return jhash_3words((__force u32)iph->saddr, (__force u32)iph->daddr, iph->protocol, initval); return jhash_3words((__force u32)iph->daddr, (__force u32)iph->saddr, iph->protocol, initval); } static inline u32 hash_v6(const struct ipv6hdr *ip6h, u32 initval) { u32 a, b, c; if ((__force u32)ip6h->saddr.s6_addr32[3] < (__force u32)ip6h->daddr.s6_addr32[3]) { a = (__force u32) ip6h->saddr.s6_addr32[3]; b = (__force u32) ip6h->daddr.s6_addr32[3]; } else { b = (__force u32) ip6h->saddr.s6_addr32[3]; a = (__force u32) ip6h->daddr.s6_addr32[3]; } if ((__force u32)ip6h->saddr.s6_addr32[1] < (__force u32)ip6h->daddr.s6_addr32[1]) c = (__force u32) ip6h->saddr.s6_addr32[1]; else c = (__force u32) ip6h->daddr.s6_addr32[1]; return jhash_3words(a, b, c, initval); } static inline u32 hash_bridge(const struct sk_buff *skb, u32 initval) { struct ipv6hdr *ip6h, _ip6h; struct iphdr *iph, _iph; switch (eth_hdr(skb)->h_proto) { case htons(ETH_P_IP): iph = skb_header_pointer(skb, skb_network_offset(skb), sizeof(*iph), &_iph); if (iph) return hash_v4(iph, initval); break; case htons(ETH_P_IPV6): ip6h = skb_header_pointer(skb, skb_network_offset(skb), sizeof(*ip6h), &_ip6h); if (ip6h) return hash_v6(ip6h, initval); break; } return 0; } static inline u32 nfqueue_hash(const struct sk_buff *skb, u16 queue, u16 queues_total, u8 family, u32 initval) { switch (family) { case NFPROTO_IPV4: queue += reciprocal_scale(hash_v4(ip_hdr(skb), initval), queues_total); break; case NFPROTO_IPV6: queue += reciprocal_scale(hash_v6(ipv6_hdr(skb), initval), queues_total); break; case NFPROTO_BRIDGE: queue += reciprocal_scale(hash_bridge(skb, initval), queues_total); break; } return queue; } int nf_queue(struct sk_buff *skb, struct nf_hook_state *state, unsigned int index, unsigned int verdict); #endif /* _NF_QUEUE_H */
8 7 7 8 8 6 6 6 6 1 5 5 6 5 1 5 54 54 54 54 54 115 53 114 8 115 3 53 52 52 54 8 54 53 52 1 37 37 37 87 37 37 37 37 36 37 37 113 115 112 115 116 108 108 8 113 115 115 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 #include <linux/gfp.h> #include <linux/highmem.h> #include <linux/kernel.h> #include <linux/mmdebug.h> #include <linux/mm_types.h> #include <linux/mm_inline.h> #include <linux/pagemap.h> #include <linux/rcupdate.h> #include <linux/smp.h> #include <linux/swap.h> #include <linux/rmap.h> #include <asm/pgalloc.h> #include <asm/tlb.h> #ifndef CONFIG_MMU_GATHER_NO_GATHER static bool tlb_next_batch(struct mmu_gather *tlb) { struct mmu_gather_batch *batch; /* Limit batching if we have delayed rmaps pending */ if (tlb->delayed_rmap && tlb->active != &tlb->local) return false; batch = tlb->active; if (batch->next) { tlb->active = batch->next; return true; } if (tlb->batch_count == MAX_GATHER_BATCH_COUNT) return false; batch = (void *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN); if (!batch) return false; tlb->batch_count++; batch->next = NULL; batch->nr = 0; batch->max = MAX_GATHER_BATCH; tlb->active->next = batch; tlb->active = batch; return true; } #ifdef CONFIG_SMP static void tlb_flush_rmap_batch(struct mmu_gather_batch *batch, struct vm_area_struct *vma) { struct encoded_page **pages = batch->encoded_pages; for (int i = 0; i < batch->nr; i++) { struct encoded_page *enc = pages[i]; if (encoded_page_flags(enc) & ENCODED_PAGE_BIT_DELAY_RMAP) { struct page *page = encoded_page_ptr(enc); unsigned int nr_pages = 1; if (unlikely(encoded_page_flags(enc) & ENCODED_PAGE_BIT_NR_PAGES_NEXT)) nr_pages = encoded_nr_pages(pages[++i]); folio_remove_rmap_ptes(page_folio(page), page, nr_pages, vma); } } } /** * tlb_flush_rmaps - do pending rmap removals after we have flushed the TLB * @tlb: the current mmu_gather * @vma: The memory area from which the pages are being removed. * * Note that because of how tlb_next_batch() above works, we will * never start multiple new batches with pending delayed rmaps, so * we only need to walk through the current active batch and the * original local one. */ void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma) { if (!tlb->delayed_rmap) return; tlb_flush_rmap_batch(&tlb->local, vma); if (tlb->active != &tlb->local) tlb_flush_rmap_batch(tlb->active, vma); tlb->delayed_rmap = 0; } #endif /* * We might end up freeing a lot of pages. Reschedule on a regular * basis to avoid soft lockups in configurations without full * preemption enabled. The magic number of 512 folios seems to work. */ #define MAX_NR_FOLIOS_PER_FREE 512 static void __tlb_batch_free_encoded_pages(struct mmu_gather_batch *batch) { struct encoded_page **pages = batch->encoded_pages; unsigned int nr, nr_pages; while (batch->nr) { if (!page_poisoning_enabled_static() && !want_init_on_free()) { nr = min(MAX_NR_FOLIOS_PER_FREE, batch->nr); /* * Make sure we cover page + nr_pages, and don't leave * nr_pages behind when capping the number of entries. */ if (unlikely(encoded_page_flags(pages[nr - 1]) & ENCODED_PAGE_BIT_NR_PAGES_NEXT)) nr++; } else { /* * With page poisoning and init_on_free, the time it * takes to free memory grows proportionally with the * actual memory size. Therefore, limit based on the * actual memory size and not the number of involved * folios. */ for (nr = 0, nr_pages = 0; nr < batch->nr && nr_pages < MAX_NR_FOLIOS_PER_FREE; nr++) { if (unlikely(encoded_page_flags(pages[nr]) & ENCODED_PAGE_BIT_NR_PAGES_NEXT)) nr_pages += encoded_nr_pages(pages[++nr]); else nr_pages++; } } free_pages_and_swap_cache(pages, nr); pages += nr; batch->nr -= nr; cond_resched(); } } static void tlb_batch_pages_flush(struct mmu_gather *tlb) { struct mmu_gather_batch *batch; for (batch = &tlb->local; batch && batch->nr; batch = batch->next) __tlb_batch_free_encoded_pages(batch); tlb->active = &tlb->local; } static void tlb_batch_list_free(struct mmu_gather *tlb) { struct mmu_gather_batch *batch, *next; for (batch = tlb->local.next; batch; batch = next) { next = batch->next; free_pages((unsigned long)batch, 0); } tlb->local.next = NULL; } static bool __tlb_remove_folio_pages_size(struct mmu_gather *tlb, struct page *page, unsigned int nr_pages, bool delay_rmap, int page_size) { int flags = delay_rmap ? ENCODED_PAGE_BIT_DELAY_RMAP : 0; struct mmu_gather_batch *batch; VM_BUG_ON(!tlb->end); #ifdef CONFIG_MMU_GATHER_PAGE_SIZE VM_WARN_ON(tlb->page_size != page_size); VM_WARN_ON_ONCE(nr_pages != 1 && page_size != PAGE_SIZE); VM_WARN_ON_ONCE(page_folio(page) != page_folio(page + nr_pages - 1)); #endif batch = tlb->active; /* * Add the page and check if we are full. If so * force a flush. */ if (likely(nr_pages == 1)) { batch->encoded_pages[batch->nr++] = encode_page(page, flags); } else { flags |= ENCODED_PAGE_BIT_NR_PAGES_NEXT; batch->encoded_pages[batch->nr++] = encode_page(page, flags); batch->encoded_pages[batch->nr++] = encode_nr_pages(nr_pages); } /* * Make sure that we can always add another "page" + "nr_pages", * requiring two entries instead of only a single one. */ if (batch->nr >= batch->max - 1) { if (!tlb_next_batch(tlb)) return true; batch = tlb->active; } VM_BUG_ON_PAGE(batch->nr > batch->max - 1, page); return false; } bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page, unsigned int nr_pages, bool delay_rmap) { return __tlb_remove_folio_pages_size(tlb, page, nr_pages, delay_rmap, PAGE_SIZE); } bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, bool delay_rmap, int page_size) { return __tlb_remove_folio_pages_size(tlb, page, 1, delay_rmap, page_size); } #endif /* MMU_GATHER_NO_GATHER */ #ifdef CONFIG_MMU_GATHER_TABLE_FREE static void __tlb_remove_table_free(struct mmu_table_batch *batch) { int i; for (i = 0; i < batch->nr; i++) __tlb_remove_table(batch->tables[i]); free_page((unsigned long)batch); } #ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE /* * Semi RCU freeing of the page directories. * * This is needed by some architectures to implement software pagetable walkers. * * gup_fast() and other software pagetable walkers do a lockless page-table * walk and therefore needs some synchronization with the freeing of the page * directories. The chosen means to accomplish that is by disabling IRQs over * the walk. * * Architectures that use IPIs to flush TLBs will then automagically DTRT, * since we unlink the page, flush TLBs, free the page. Since the disabling of * IRQs delays the completion of the TLB flush we can never observe an already * freed page. * * Architectures that do not have this (PPC) need to delay the freeing by some * other means, this is that means. * * What we do is batch the freed directory pages (tables) and RCU free them. * We use the sched RCU variant, as that guarantees that IRQ/preempt disabling * holds off grace periods. * * However, in order to batch these pages we need to allocate storage, this * allocation is deep inside the MM code and can thus easily fail on memory * pressure. To guarantee progress we fall back to single table freeing, see * the implementation of tlb_remove_table_one(). * */ static void tlb_remove_table_smp_sync(void *arg) { /* Simply deliver the interrupt */ } void tlb_remove_table_sync_one(void) { /* * This isn't an RCU grace period and hence the page-tables cannot be * assumed to be actually RCU-freed. * * It is however sufficient for software page-table walkers that rely on * IRQ disabling. */ smp_call_function(tlb_remove_table_smp_sync, NULL, 1); } static void tlb_remove_table_rcu(struct rcu_head *head) { __tlb_remove_table_free(container_of(head, struct mmu_table_batch, rcu)); } static void tlb_remove_table_free(struct mmu_table_batch *batch) { call_rcu(&batch->rcu, tlb_remove_table_rcu); } #else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */ static void tlb_remove_table_free(struct mmu_table_batch *batch) { __tlb_remove_table_free(batch); } #endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */ /* * If we want tlb_remove_table() to imply TLB invalidates. */ static inline void tlb_table_invalidate(struct mmu_gather *tlb) { if (tlb_needs_table_invalidate()) { /* * Invalidate page-table caches used by hardware walkers. Then * we still need to RCU-sched wait while freeing the pages * because software walkers can still be in-flight. */ tlb_flush_mmu_tlbonly(tlb); } } static void tlb_remove_table_one(void *table) { tlb_remove_table_sync_one(); __tlb_remove_table(table); } static void tlb_table_flush(struct mmu_gather *tlb) { struct mmu_table_batch **batch = &tlb->batch; if (*batch) { tlb_table_invalidate(tlb); tlb_remove_table_free(*batch); *batch = NULL; } } void tlb_remove_table(struct mmu_gather *tlb, void *table) { struct mmu_table_batch **batch = &tlb->batch; if (*batch == NULL) { *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN); if (*batch == NULL) { tlb_table_invalidate(tlb); tlb_remove_table_one(table); return; } (*batch)->nr = 0; } (*batch)->tables[(*batch)->nr++] = table; if ((*batch)->nr == MAX_TABLE_BATCH) tlb_table_flush(tlb); } static inline void tlb_table_init(struct mmu_gather *tlb) { tlb->batch = NULL; } #else /* !CONFIG_MMU_GATHER_TABLE_FREE */ static inline void tlb_table_flush(struct mmu_gather *tlb) { } static inline void tlb_table_init(struct mmu_gather *tlb) { } #endif /* CONFIG_MMU_GATHER_TABLE_FREE */ static void tlb_flush_mmu_free(struct mmu_gather *tlb) { tlb_table_flush(tlb); #ifndef CONFIG_MMU_GATHER_NO_GATHER tlb_batch_pages_flush(tlb); #endif } void tlb_flush_mmu(struct mmu_gather *tlb) { tlb_flush_mmu_tlbonly(tlb); tlb_flush_mmu_free(tlb); } static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm) { tlb->mm = mm; tlb->fullmm = fullmm; #ifndef CONFIG_MMU_GATHER_NO_GATHER tlb->need_flush_all = 0; tlb->local.next = NULL; tlb->local.nr = 0; tlb->local.max = ARRAY_SIZE(tlb->__pages); tlb->active = &tlb->local; tlb->batch_count = 0; #endif tlb->delayed_rmap = 0; tlb_table_init(tlb); #ifdef CONFIG_MMU_GATHER_PAGE_SIZE tlb->page_size = 0; #endif __tlb_reset_range(tlb); inc_tlb_flush_pending(tlb->mm); } /** * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down * @tlb: the mmu_gather structure to initialize * @mm: the mm_struct of the target address space * * Called to initialize an (on-stack) mmu_gather structure for page-table * tear-down from @mm. */ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm) { __tlb_gather_mmu(tlb, mm, false); } /** * tlb_gather_mmu_fullmm - initialize an mmu_gather structure for page-table tear-down * @tlb: the mmu_gather structure to initialize * @mm: the mm_struct of the target address space * * In this case, @mm is without users and we're going to destroy the * full address space (exit/execve). * * Called to initialize an (on-stack) mmu_gather structure for page-table * tear-down from @mm. */ void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm) { __tlb_gather_mmu(tlb, mm, true); } /** * tlb_finish_mmu - finish an mmu_gather structure * @tlb: the mmu_gather structure to finish * * Called at the end of the shootdown operation to free up any resources that * were required. */ void tlb_finish_mmu(struct mmu_gather *tlb) { /* * If there are parallel threads are doing PTE changes on same range * under non-exclusive lock (e.g., mmap_lock read-side) but defer TLB * flush by batching, one thread may end up seeing inconsistent PTEs * and result in having stale TLB entries. So flush TLB forcefully * if we detect parallel PTE batching threads. * * However, some syscalls, e.g. munmap(), may free page tables, this * needs force flush everything in the given range. Otherwise this * may result in having stale TLB entries for some architectures, * e.g. aarch64, that could specify flush what level TLB. */ if (mm_tlb_flush_nested(tlb->mm)) { /* * The aarch64 yields better performance with fullmm by * avoiding multiple CPUs spamming TLBI messages at the * same time. * * On x86 non-fullmm doesn't yield significant difference * against fullmm. */ tlb->fullmm = 1; __tlb_reset_range(tlb); tlb->freed_tables = 1; } tlb_flush_mmu(tlb); #ifndef CONFIG_MMU_GATHER_NO_GATHER tlb_batch_list_free(tlb); #endif dec_tlb_flush_pending(tlb->mm); }
3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2013 Patrick McHardy <kaber@trash.net> */ #include <linux/module.h> #include <linux/skbuff.h> #include <asm/unaligned.h> #include <net/tcp.h> #include <net/netns/generic.h> #include <linux/proc_fs.h> #include <linux/netfilter_ipv6.h> #include <linux/netfilter/nf_synproxy.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_synproxy.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_synproxy.h> unsigned int synproxy_net_id; EXPORT_SYMBOL_GPL(synproxy_net_id); bool synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, const struct tcphdr *th, struct synproxy_options *opts) { int length = (th->doff * 4) - sizeof(*th); u8 buf[40], *ptr; if (unlikely(length < 0)) return false; ptr = skb_header_pointer(skb, doff + sizeof(*th), length, buf); if (ptr == NULL) return false; opts->options = 0; while (length > 0) { int opcode = *ptr++; int opsize; switch (opcode) { case TCPOPT_EOL: return true; case TCPOPT_NOP: length--; continue; default: if (length < 2) return true; opsize = *ptr++; if (opsize < 2) return true; if (opsize > length) return true; switch (opcode) { case TCPOPT_MSS: if (opsize == TCPOLEN_MSS) { opts->mss_option = get_unaligned_be16(ptr); opts->options |= NF_SYNPROXY_OPT_MSS; } break; case TCPOPT_WINDOW: if (opsize == TCPOLEN_WINDOW) { opts->wscale = *ptr; if (opts->wscale > TCP_MAX_WSCALE) opts->wscale = TCP_MAX_WSCALE; opts->options |= NF_SYNPROXY_OPT_WSCALE; } break; case TCPOPT_TIMESTAMP: if (opsize == TCPOLEN_TIMESTAMP) { opts->tsval = get_unaligned_be32(ptr); opts->tsecr = get_unaligned_be32(ptr + 4); opts->options |= NF_SYNPROXY_OPT_TIMESTAMP; } break; case TCPOPT_SACK_PERM: if (opsize == TCPOLEN_SACK_PERM) opts->options |= NF_SYNPROXY_OPT_SACK_PERM; break; } ptr += opsize - 2; length -= opsize; } } return true; } EXPORT_SYMBOL_GPL(synproxy_parse_options); static unsigned int synproxy_options_size(const struct synproxy_options *opts) { unsigned int size = 0; if (opts->options & NF_SYNPROXY_OPT_MSS) size += TCPOLEN_MSS_ALIGNED; if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP) size += TCPOLEN_TSTAMP_ALIGNED; else if (opts->options & NF_SYNPROXY_OPT_SACK_PERM) size += TCPOLEN_SACKPERM_ALIGNED; if (opts->options & NF_SYNPROXY_OPT_WSCALE) size += TCPOLEN_WSCALE_ALIGNED; return size; } static void synproxy_build_options(struct tcphdr *th, const struct synproxy_options *opts) { __be32 *ptr = (__be32 *)(th + 1); u8 options = opts->options; if (options & NF_SYNPROXY_OPT_MSS) *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | opts->mss_option); if (options & NF_SYNPROXY_OPT_TIMESTAMP) { if (options & NF_SYNPROXY_OPT_SACK_PERM) *ptr++ = htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); else *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); *ptr++ = htonl(opts->tsval); *ptr++ = htonl(opts->tsecr); } else if (options & NF_SYNPROXY_OPT_SACK_PERM) *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM); if (options & NF_SYNPROXY_OPT_WSCALE) *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | opts->wscale); } void synproxy_init_timestamp_cookie(const struct nf_synproxy_info *info, struct synproxy_options *opts) { opts->tsecr = opts->tsval; opts->tsval = tcp_clock_ms() & ~0x3f; if (opts->options & NF_SYNPROXY_OPT_WSCALE) { opts->tsval |= opts->wscale; opts->wscale = info->wscale; } else opts->tsval |= 0xf; if (opts->options & NF_SYNPROXY_OPT_SACK_PERM) opts->tsval |= 1 << 4; if (opts->options & NF_SYNPROXY_OPT_ECN) opts->tsval |= 1 << 5; } EXPORT_SYMBOL_GPL(synproxy_init_timestamp_cookie); static void synproxy_check_timestamp_cookie(struct synproxy_options *opts) { opts->wscale = opts->tsecr & 0xf; if (opts->wscale != 0xf) opts->options |= NF_SYNPROXY_OPT_WSCALE; opts->options |= opts->tsecr & (1 << 4) ? NF_SYNPROXY_OPT_SACK_PERM : 0; opts->options |= opts->tsecr & (1 << 5) ? NF_SYNPROXY_OPT_ECN : 0; } static unsigned int synproxy_tstamp_adjust(struct sk_buff *skb, unsigned int protoff, struct tcphdr *th, struct nf_conn *ct, enum ip_conntrack_info ctinfo, const struct nf_conn_synproxy *synproxy) { unsigned int optoff, optend; __be32 *ptr, old; if (synproxy->tsoff == 0) return 1; optoff = protoff + sizeof(struct tcphdr); optend = protoff + th->doff * 4; if (skb_ensure_writable(skb, optend)) return 0; while (optoff < optend) { unsigned char *op = skb->data + optoff; switch (op[0]) { case TCPOPT_EOL: return 1; case TCPOPT_NOP: optoff++; continue; default: if (optoff + 1 == optend || optoff + op[1] > optend || op[1] < 2) return 0; if (op[0] == TCPOPT_TIMESTAMP && op[1] == TCPOLEN_TIMESTAMP) { if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) { ptr = (__be32 *)&op[2]; old = *ptr; *ptr = htonl(ntohl(*ptr) - synproxy->tsoff); } else { ptr = (__be32 *)&op[6]; old = *ptr; *ptr = htonl(ntohl(*ptr) + synproxy->tsoff); } inet_proto_csum_replace4(&th->check, skb, old, *ptr, false); return 1; } optoff += op[1]; } } return 1; } #ifdef CONFIG_PROC_FS static void *synproxy_cpu_seq_start(struct seq_file *seq, loff_t *pos) { struct synproxy_net *snet = synproxy_pernet(seq_file_net(seq)); int cpu; if (*pos == 0) return SEQ_START_TOKEN; for (cpu = *pos - 1; cpu < nr_cpu_ids; cpu++) { if (!cpu_possible(cpu)) continue; *pos = cpu + 1; return per_cpu_ptr(snet->stats, cpu); } return NULL; } static void *synproxy_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct synproxy_net *snet = synproxy_pernet(seq_file_net(seq)); int cpu; for (cpu = *pos; cpu < nr_cpu_ids; cpu++) { if (!cpu_possible(cpu)) continue; *pos = cpu + 1; return per_cpu_ptr(snet->stats, cpu); } (*pos)++; return NULL; } static void synproxy_cpu_seq_stop(struct seq_file *seq, void *v) { return; } static int synproxy_cpu_seq_show(struct seq_file *seq, void *v) { struct synproxy_stats *stats = v; if (v == SEQ_START_TOKEN) { seq_puts(seq, "entries\t\tsyn_received\t" "cookie_invalid\tcookie_valid\t" "cookie_retrans\tconn_reopened\n"); return 0; } seq_printf(seq, "%08x\t%08x\t%08x\t%08x\t%08x\t%08x\n", 0, stats->syn_received, stats->cookie_invalid, stats->cookie_valid, stats->cookie_retrans, stats->conn_reopened); return 0; } static const struct seq_operations synproxy_cpu_seq_ops = { .start = synproxy_cpu_seq_start, .next = synproxy_cpu_seq_next, .stop = synproxy_cpu_seq_stop, .show = synproxy_cpu_seq_show, }; static int __net_init synproxy_proc_init(struct net *net) { if (!proc_create_net("synproxy", 0444, net->proc_net_stat, &synproxy_cpu_seq_ops, sizeof(struct seq_net_private))) return -ENOMEM; return 0; } static void __net_exit synproxy_proc_exit(struct net *net) { remove_proc_entry("synproxy", net->proc_net_stat); } #else static int __net_init synproxy_proc_init(struct net *net) { return 0; } static void __net_exit synproxy_proc_exit(struct net *net) { return; } #endif /* CONFIG_PROC_FS */ static int __net_init synproxy_net_init(struct net *net) { struct synproxy_net *snet = synproxy_pernet(net); struct nf_conn *ct; int err = -ENOMEM; ct = nf_ct_tmpl_alloc(net, &nf_ct_zone_dflt, GFP_KERNEL); if (!ct) goto err1; if (!nfct_seqadj_ext_add(ct)) goto err2; if (!nfct_synproxy_ext_add(ct)) goto err2; __set_bit(IPS_CONFIRMED_BIT, &ct->status); snet->tmpl = ct; snet->stats = alloc_percpu(struct synproxy_stats); if (snet->stats == NULL) goto err2; err = synproxy_proc_init(net); if (err < 0) goto err3; return 0; err3: free_percpu(snet->stats); err2: nf_ct_tmpl_free(ct); err1: return err; } static void __net_exit synproxy_net_exit(struct net *net) { struct synproxy_net *snet = synproxy_pernet(net); nf_ct_put(snet->tmpl); synproxy_proc_exit(net); free_percpu(snet->stats); } static struct pernet_operations synproxy_net_ops = { .init = synproxy_net_init, .exit = synproxy_net_exit, .id = &synproxy_net_id, .size = sizeof(struct synproxy_net), }; static int __init synproxy_core_init(void) { return register_pernet_subsys(&synproxy_net_ops); } static void __exit synproxy_core_exit(void) { unregister_pernet_subsys(&synproxy_net_ops); } module_init(synproxy_core_init); module_exit(synproxy_core_exit); static struct iphdr * synproxy_build_ip(struct net *net, struct sk_buff *skb, __be32 saddr, __be32 daddr) { struct iphdr *iph; skb_reset_network_header(skb); iph = skb_put(skb, sizeof(*iph)); iph->version = 4; iph->ihl = sizeof(*iph) / 4; iph->tos = 0; iph->id = 0; iph->frag_off = htons(IP_DF); iph->ttl = READ_ONCE(net->ipv4.sysctl_ip_default_ttl); iph->protocol = IPPROTO_TCP; iph->check = 0; iph->saddr = saddr; iph->daddr = daddr; return iph; } static void synproxy_send_tcp(struct net *net, const struct sk_buff *skb, struct sk_buff *nskb, struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo, struct iphdr *niph, struct tcphdr *nth, unsigned int tcp_hdr_size) { nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0); nskb->ip_summed = CHECKSUM_PARTIAL; nskb->csum_start = (unsigned char *)nth - nskb->head; nskb->csum_offset = offsetof(struct tcphdr, check); skb_dst_set_noref(nskb, skb_dst(skb)); nskb->protocol = htons(ETH_P_IP); if (ip_route_me_harder(net, nskb->sk, nskb, RTN_UNSPEC)) goto free_nskb; if (nfct) { nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo); nf_conntrack_get(nfct); } ip_local_out(net, nskb->sk, nskb); return; free_nskb: kfree_skb(nskb); } void synproxy_send_client_synack(struct net *net, const struct sk_buff *skb, const struct tcphdr *th, const struct synproxy_options *opts) { struct sk_buff *nskb; struct iphdr *iph, *niph; struct tcphdr *nth; unsigned int tcp_hdr_size; u16 mss = opts->mss_encode; iph = ip_hdr(skb); tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, GFP_ATOMIC); if (!nskb) return; skb_reserve(nskb, MAX_TCP_HEADER); niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr); skb_reset_transport_header(nskb); nth = skb_put(nskb, tcp_hdr_size); nth->source = th->dest; nth->dest = th->source; nth->seq = htonl(__cookie_v4_init_sequence(iph, th, &mss)); nth->ack_seq = htonl(ntohl(th->seq) + 1); tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK; if (opts->options & NF_SYNPROXY_OPT_ECN) tcp_flag_word(nth) |= TCP_FLAG_ECE; nth->doff = tcp_hdr_size / 4; nth->window = 0; nth->check = 0; nth->urg_ptr = 0; synproxy_build_options(nth, opts); synproxy_send_tcp(net, skb, nskb, skb_nfct(skb), IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); } EXPORT_SYMBOL_GPL(synproxy_send_client_synack); static void synproxy_send_server_syn(struct net *net, const struct sk_buff *skb, const struct tcphdr *th, const struct synproxy_options *opts, u32 recv_seq) { struct synproxy_net *snet = synproxy_pernet(net); struct sk_buff *nskb; struct iphdr *iph, *niph; struct tcphdr *nth; unsigned int tcp_hdr_size; iph = ip_hdr(skb); tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, GFP_ATOMIC); if (!nskb) return; skb_reserve(nskb, MAX_TCP_HEADER); niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr); skb_reset_transport_header(nskb); nth = skb_put(nskb, tcp_hdr_size); nth->source = th->source; nth->dest = th->dest; nth->seq = htonl(recv_seq - 1); /* ack_seq is used to relay our ISN to the synproxy hook to initialize * sequence number translation once a connection tracking entry exists. */ nth->ack_seq = htonl(ntohl(th->ack_seq) - 1); tcp_flag_word(nth) = TCP_FLAG_SYN; if (opts->options & NF_SYNPROXY_OPT_ECN) tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR; nth->doff = tcp_hdr_size / 4; nth->window = th->window; nth->check = 0; nth->urg_ptr = 0; synproxy_build_options(nth, opts); synproxy_send_tcp(net, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, niph, nth, tcp_hdr_size); } static void synproxy_send_server_ack(struct net *net, const struct ip_ct_tcp *state, const struct sk_buff *skb, const struct tcphdr *th, const struct synproxy_options *opts) { struct sk_buff *nskb; struct iphdr *iph, *niph; struct tcphdr *nth; unsigned int tcp_hdr_size; iph = ip_hdr(skb); tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, GFP_ATOMIC); if (!nskb) return; skb_reserve(nskb, MAX_TCP_HEADER); niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr); skb_reset_transport_header(nskb); nth = skb_put(nskb, tcp_hdr_size); nth->source = th->dest; nth->dest = th->source; nth->seq = htonl(ntohl(th->ack_seq)); nth->ack_seq = htonl(ntohl(th->seq) + 1); tcp_flag_word(nth) = TCP_FLAG_ACK; nth->doff = tcp_hdr_size / 4; nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin); nth->check = 0; nth->urg_ptr = 0; synproxy_build_options(nth, opts); synproxy_send_tcp(net, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); } static void synproxy_send_client_ack(struct net *net, const struct sk_buff *skb, const struct tcphdr *th, const struct synproxy_options *opts) { struct sk_buff *nskb; struct iphdr *iph, *niph; struct tcphdr *nth; unsigned int tcp_hdr_size; iph = ip_hdr(skb); tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, GFP_ATOMIC); if (!nskb) return; skb_reserve(nskb, MAX_TCP_HEADER); niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr); skb_reset_transport_header(nskb); nth = skb_put(nskb, tcp_hdr_size); nth->source = th->source; nth->dest = th->dest; nth->seq = htonl(ntohl(th->seq) + 1); nth->ack_seq = th->ack_seq; tcp_flag_word(nth) = TCP_FLAG_ACK; nth->doff = tcp_hdr_size / 4; nth->window = htons(ntohs(th->window) >> opts->wscale); nth->check = 0; nth->urg_ptr = 0; synproxy_build_options(nth, opts); synproxy_send_tcp(net, skb, nskb, skb_nfct(skb), IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); } bool synproxy_recv_client_ack(struct net *net, const struct sk_buff *skb, const struct tcphdr *th, struct synproxy_options *opts, u32 recv_seq) { struct synproxy_net *snet = synproxy_pernet(net); int mss; mss = __cookie_v4_check(ip_hdr(skb), th); if (mss == 0) { this_cpu_inc(snet->stats->cookie_invalid); return false; } this_cpu_inc(snet->stats->cookie_valid); opts->mss_option = mss; opts->options |= NF_SYNPROXY_OPT_MSS; if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP) synproxy_check_timestamp_cookie(opts); synproxy_send_server_syn(net, skb, th, opts, recv_seq); return true; } EXPORT_SYMBOL_GPL(synproxy_recv_client_ack); unsigned int ipv4_synproxy_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *nhs) { struct net *net = nhs->net; struct synproxy_net *snet = synproxy_pernet(net); enum ip_conntrack_info ctinfo; struct nf_conn *ct; struct nf_conn_synproxy *synproxy; struct synproxy_options opts = {}; const struct ip_ct_tcp *state; struct tcphdr *th, _th; unsigned int thoff; ct = nf_ct_get(skb, &ctinfo); if (!ct) return NF_ACCEPT; synproxy = nfct_synproxy(ct); if (!synproxy) return NF_ACCEPT; if (nf_is_loopback_packet(skb) || ip_hdr(skb)->protocol != IPPROTO_TCP) return NF_ACCEPT; thoff = ip_hdrlen(skb); th = skb_header_pointer(skb, thoff, sizeof(_th), &_th); if (!th) return NF_DROP; state = &ct->proto.tcp; switch (state->state) { case TCP_CONNTRACK_CLOSE: if (th->rst && CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) { nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq) + 1); break; } if (!th->syn || th->ack || CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) break; /* Reopened connection - reset the sequence number and timestamp * adjustments, they will get initialized once the connection is * reestablished. */ nf_ct_seqadj_init(ct, ctinfo, 0); synproxy->tsoff = 0; this_cpu_inc(snet->stats->conn_reopened); fallthrough; case TCP_CONNTRACK_SYN_SENT: if (!synproxy_parse_options(skb, thoff, th, &opts)) return NF_DROP; if (!th->syn && th->ack && CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1, * therefore we need to add 1 to make the SYN sequence * number match the one of first SYN. */ if (synproxy_recv_client_ack(net, skb, th, &opts, ntohl(th->seq) + 1)) { this_cpu_inc(snet->stats->cookie_retrans); consume_skb(skb); return NF_STOLEN; } else { return NF_DROP; } } synproxy->isn = ntohl(th->ack_seq); if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP) synproxy->its = opts.tsecr; nf_conntrack_event_cache(IPCT_SYNPROXY, ct); break; case TCP_CONNTRACK_SYN_RECV: if (!th->syn || !th->ack) break; if (!synproxy_parse_options(skb, thoff, th, &opts)) return NF_DROP; if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP) { synproxy->tsoff = opts.tsval - synproxy->its; nf_conntrack_event_cache(IPCT_SYNPROXY, ct); } opts.options &= ~(NF_SYNPROXY_OPT_MSS | NF_SYNPROXY_OPT_WSCALE | NF_SYNPROXY_OPT_SACK_PERM); swap(opts.tsval, opts.tsecr); synproxy_send_server_ack(net, state, skb, th, &opts); nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); nf_conntrack_event_cache(IPCT_SEQADJ, ct); swap(opts.tsval, opts.tsecr); synproxy_send_client_ack(net, skb, th, &opts); consume_skb(skb); return NF_STOLEN; default: break; } synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy); return NF_ACCEPT; } EXPORT_SYMBOL_GPL(ipv4_synproxy_hook); static const struct nf_hook_ops ipv4_synproxy_ops[] = { { .hook = ipv4_synproxy_hook, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, }, { .hook = ipv4_synproxy_hook, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, }, }; int nf_synproxy_ipv4_init(struct synproxy_net *snet, struct net *net) { int err; if (snet->hook_ref4 == 0) { err = nf_register_net_hooks(net, ipv4_synproxy_ops, ARRAY_SIZE(ipv4_synproxy_ops)); if (err) return err; } snet->hook_ref4++; return 0; } EXPORT_SYMBOL_GPL(nf_synproxy_ipv4_init); void nf_synproxy_ipv4_fini(struct synproxy_net *snet, struct net *net) { snet->hook_ref4--; if (snet->hook_ref4 == 0) nf_unregister_net_hooks(net, ipv4_synproxy_ops, ARRAY_SIZE(ipv4_synproxy_ops)); } EXPORT_SYMBOL_GPL(nf_synproxy_ipv4_fini); #if IS_ENABLED(CONFIG_IPV6) static struct ipv6hdr * synproxy_build_ip_ipv6(struct net *net, struct sk_buff *skb, const struct in6_addr *saddr, const struct in6_addr *daddr) { struct ipv6hdr *iph; skb_reset_network_header(skb); iph = skb_put(skb, sizeof(*iph)); ip6_flow_hdr(iph, 0, 0); iph->hop_limit = READ_ONCE(net->ipv6.devconf_all->hop_limit); iph->nexthdr = IPPROTO_TCP; iph->saddr = *saddr; iph->daddr = *daddr; return iph; } static void synproxy_send_tcp_ipv6(struct net *net, const struct sk_buff *skb, struct sk_buff *nskb, struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo, struct ipv6hdr *niph, struct tcphdr *nth, unsigned int tcp_hdr_size) { struct dst_entry *dst; struct flowi6 fl6; int err; nth->check = ~tcp_v6_check(tcp_hdr_size, &niph->saddr, &niph->daddr, 0); nskb->ip_summed = CHECKSUM_PARTIAL; nskb->csum_start = (unsigned char *)nth - nskb->head; nskb->csum_offset = offsetof(struct tcphdr, check); memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_TCP; fl6.saddr = niph->saddr; fl6.daddr = niph->daddr; fl6.fl6_sport = nth->source; fl6.fl6_dport = nth->dest; security_skb_classify_flow((struct sk_buff *)skb, flowi6_to_flowi_common(&fl6)); err = nf_ip6_route(net, &dst, flowi6_to_flowi(&fl6), false); if (err) { goto free_nskb; } dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); if (IS_ERR(dst)) goto free_nskb; skb_dst_set(nskb, dst); if (nfct) { nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo); nf_conntrack_get(nfct); } ip6_local_out(net, nskb->sk, nskb); return; free_nskb: kfree_skb(nskb); } void synproxy_send_client_synack_ipv6(struct net *net, const struct sk_buff *skb, const struct tcphdr *th, const struct synproxy_options *opts) { struct sk_buff *nskb; struct ipv6hdr *iph, *niph; struct tcphdr *nth; unsigned int tcp_hdr_size; u16 mss = opts->mss_encode; iph = ipv6_hdr(skb); tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, GFP_ATOMIC); if (!nskb) return; skb_reserve(nskb, MAX_TCP_HEADER); niph = synproxy_build_ip_ipv6(net, nskb, &iph->daddr, &iph->saddr); skb_reset_transport_header(nskb); nth = skb_put(nskb, tcp_hdr_size); nth->source = th->dest; nth->dest = th->source; nth->seq = htonl(nf_ipv6_cookie_init_sequence(iph, th, &mss)); nth->ack_seq = htonl(ntohl(th->seq) + 1); tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK; if (opts->options & NF_SYNPROXY_OPT_ECN) tcp_flag_word(nth) |= TCP_FLAG_ECE; nth->doff = tcp_hdr_size / 4; nth->window = 0; nth->check = 0; nth->urg_ptr = 0; synproxy_build_options(nth, opts); synproxy_send_tcp_ipv6(net, skb, nskb, skb_nfct(skb), IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); } EXPORT_SYMBOL_GPL(synproxy_send_client_synack_ipv6); static void synproxy_send_server_syn_ipv6(struct net *net, const struct sk_buff *skb, const struct tcphdr *th, const struct synproxy_options *opts, u32 recv_seq) { struct synproxy_net *snet = synproxy_pernet(net); struct sk_buff *nskb; struct ipv6hdr *iph, *niph; struct tcphdr *nth; unsigned int tcp_hdr_size; iph = ipv6_hdr(skb); tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, GFP_ATOMIC); if (!nskb) return; skb_reserve(nskb, MAX_TCP_HEADER); niph = synproxy_build_ip_ipv6(net, nskb, &iph->saddr, &iph->daddr); skb_reset_transport_header(nskb); nth = skb_put(nskb, tcp_hdr_size); nth->source = th->source; nth->dest = th->dest; nth->seq = htonl(recv_seq - 1); /* ack_seq is used to relay our ISN to the synproxy hook to initialize * sequence number translation once a connection tracking entry exists. */ nth->ack_seq = htonl(ntohl(th->ack_seq) - 1); tcp_flag_word(nth) = TCP_FLAG_SYN; if (opts->options & NF_SYNPROXY_OPT_ECN) tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR; nth->doff = tcp_hdr_size / 4; nth->window = th->window; nth->check = 0; nth->urg_ptr = 0; synproxy_build_options(nth, opts); synproxy_send_tcp_ipv6(net, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, niph, nth, tcp_hdr_size); } static void synproxy_send_server_ack_ipv6(struct net *net, const struct ip_ct_tcp *state, const struct sk_buff *skb, const struct tcphdr *th, const struct synproxy_options *opts) { struct sk_buff *nskb; struct ipv6hdr *iph, *niph; struct tcphdr *nth; unsigned int tcp_hdr_size; iph = ipv6_hdr(skb); tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, GFP_ATOMIC); if (!nskb) return; skb_reserve(nskb, MAX_TCP_HEADER); niph = synproxy_build_ip_ipv6(net, nskb, &iph->daddr, &iph->saddr); skb_reset_transport_header(nskb); nth = skb_put(nskb, tcp_hdr_size); nth->source = th->dest; nth->dest = th->source; nth->seq = htonl(ntohl(th->ack_seq)); nth->ack_seq = htonl(ntohl(th->seq) + 1); tcp_flag_word(nth) = TCP_FLAG_ACK; nth->doff = tcp_hdr_size / 4; nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin); nth->check = 0; nth->urg_ptr = 0; synproxy_build_options(nth, opts); synproxy_send_tcp_ipv6(net, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); } static void synproxy_send_client_ack_ipv6(struct net *net, const struct sk_buff *skb, const struct tcphdr *th, const struct synproxy_options *opts) { struct sk_buff *nskb; struct ipv6hdr *iph, *niph; struct tcphdr *nth; unsigned int tcp_hdr_size; iph = ipv6_hdr(skb); tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, GFP_ATOMIC); if (!nskb) return; skb_reserve(nskb, MAX_TCP_HEADER); niph = synproxy_build_ip_ipv6(net, nskb, &iph->saddr, &iph->daddr); skb_reset_transport_header(nskb); nth = skb_put(nskb, tcp_hdr_size); nth->source = th->source; nth->dest = th->dest; nth->seq = htonl(ntohl(th->seq) + 1); nth->ack_seq = th->ack_seq; tcp_flag_word(nth) = TCP_FLAG_ACK; nth->doff = tcp_hdr_size / 4; nth->window = htons(ntohs(th->window) >> opts->wscale); nth->check = 0; nth->urg_ptr = 0; synproxy_build_options(nth, opts); synproxy_send_tcp_ipv6(net, skb, nskb, skb_nfct(skb), IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); } bool synproxy_recv_client_ack_ipv6(struct net *net, const struct sk_buff *skb, const struct tcphdr *th, struct synproxy_options *opts, u32 recv_seq) { struct synproxy_net *snet = synproxy_pernet(net); int mss; mss = nf_cookie_v6_check(ipv6_hdr(skb), th); if (mss == 0) { this_cpu_inc(snet->stats->cookie_invalid); return false; } this_cpu_inc(snet->stats->cookie_valid); opts->mss_option = mss; opts->options |= NF_SYNPROXY_OPT_MSS; if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP) synproxy_check_timestamp_cookie(opts); synproxy_send_server_syn_ipv6(net, skb, th, opts, recv_seq); return true; } EXPORT_SYMBOL_GPL(synproxy_recv_client_ack_ipv6); unsigned int ipv6_synproxy_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *nhs) { struct net *net = nhs->net; struct synproxy_net *snet = synproxy_pernet(net); enum ip_conntrack_info ctinfo; struct nf_conn *ct; struct nf_conn_synproxy *synproxy; struct synproxy_options opts = {}; const struct ip_ct_tcp *state; struct tcphdr *th, _th; __be16 frag_off; u8 nexthdr; int thoff; ct = nf_ct_get(skb, &ctinfo); if (!ct) return NF_ACCEPT; synproxy = nfct_synproxy(ct); if (!synproxy) return NF_ACCEPT; if (nf_is_loopback_packet(skb)) return NF_ACCEPT; nexthdr = ipv6_hdr(skb)->nexthdr; thoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, &frag_off); if (thoff < 0 || nexthdr != IPPROTO_TCP) return NF_ACCEPT; th = skb_header_pointer(skb, thoff, sizeof(_th), &_th); if (!th) return NF_DROP; state = &ct->proto.tcp; switch (state->state) { case TCP_CONNTRACK_CLOSE: if (th->rst && CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) { nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq) + 1); break; } if (!th->syn || th->ack || CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) break; /* Reopened connection - reset the sequence number and timestamp * adjustments, they will get initialized once the connection is * reestablished. */ nf_ct_seqadj_init(ct, ctinfo, 0); synproxy->tsoff = 0; this_cpu_inc(snet->stats->conn_reopened); fallthrough; case TCP_CONNTRACK_SYN_SENT: if (!synproxy_parse_options(skb, thoff, th, &opts)) return NF_DROP; if (!th->syn && th->ack && CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1, * therefore we need to add 1 to make the SYN sequence * number match the one of first SYN. */ if (synproxy_recv_client_ack_ipv6(net, skb, th, &opts, ntohl(th->seq) + 1)) { this_cpu_inc(snet->stats->cookie_retrans); consume_skb(skb); return NF_STOLEN; } else { return NF_DROP; } } synproxy->isn = ntohl(th->ack_seq); if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP) synproxy->its = opts.tsecr; nf_conntrack_event_cache(IPCT_SYNPROXY, ct); break; case TCP_CONNTRACK_SYN_RECV: if (!th->syn || !th->ack) break; if (!synproxy_parse_options(skb, thoff, th, &opts)) return NF_DROP; if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP) { synproxy->tsoff = opts.tsval - synproxy->its; nf_conntrack_event_cache(IPCT_SYNPROXY, ct); } opts.options &= ~(NF_SYNPROXY_OPT_MSS | NF_SYNPROXY_OPT_WSCALE | NF_SYNPROXY_OPT_SACK_PERM); swap(opts.tsval, opts.tsecr); synproxy_send_server_ack_ipv6(net, state, skb, th, &opts); nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); nf_conntrack_event_cache(IPCT_SEQADJ, ct); swap(opts.tsval, opts.tsecr); synproxy_send_client_ack_ipv6(net, skb, th, &opts); consume_skb(skb); return NF_STOLEN; default: break; } synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy); return NF_ACCEPT; } EXPORT_SYMBOL_GPL(ipv6_synproxy_hook); static const struct nf_hook_ops ipv6_synproxy_ops[] = { { .hook = ipv6_synproxy_hook, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, }, { .hook = ipv6_synproxy_hook, .pf = NFPROTO_IPV6, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, }, }; int nf_synproxy_ipv6_init(struct synproxy_net *snet, struct net *net) { int err; if (snet->hook_ref6 == 0) { err = nf_register_net_hooks(net, ipv6_synproxy_ops, ARRAY_SIZE(ipv6_synproxy_ops)); if (err) return err; } snet->hook_ref6++; return 0; } EXPORT_SYMBOL_GPL(nf_synproxy_ipv6_init); void nf_synproxy_ipv6_fini(struct synproxy_net *snet, struct net *net) { snet->hook_ref6--; if (snet->hook_ref6 == 0) nf_unregister_net_hooks(net, ipv6_synproxy_ops, ARRAY_SIZE(ipv6_synproxy_ops)); } EXPORT_SYMBOL_GPL(nf_synproxy_ipv6_fini); #endif /* CONFIG_IPV6 */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_DESCRIPTION("nftables SYNPROXY expression support");
1 1 2 1 1 1 1 3 3 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 // SPDX-License-Identifier: GPL-2.0 /* * fs/ext4/fast_commit.c * * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com> * * Ext4 fast commits routines. */ #include "ext4.h" #include "ext4_jbd2.h" #include "ext4_extents.h" #include "mballoc.h" /* * Ext4 Fast Commits * ----------------- * * Ext4 fast commits implement fine grained journalling for Ext4. * * Fast commits are organized as a log of tag-length-value (TLV) structs. (See * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by * TLV during the recovery phase. For the scenarios for which we currently * don't have replay code, fast commit falls back to full commits. * Fast commits record delta in one of the following three categories. * * (A) Directory entry updates: * * - EXT4_FC_TAG_UNLINK - records directory entry unlink * - EXT4_FC_TAG_LINK - records directory entry link * - EXT4_FC_TAG_CREAT - records inode and directory entry creation * * (B) File specific data range updates: * * - EXT4_FC_TAG_ADD_RANGE - records addition of new blocks to an inode * - EXT4_FC_TAG_DEL_RANGE - records deletion of blocks from an inode * * (C) Inode metadata (mtime / ctime etc): * * - EXT4_FC_TAG_INODE - record the inode that should be replayed * during recovery. Note that iblocks field is * not replayed and instead derived during * replay. * Commit Operation * ---------------- * With fast commits, we maintain all the directory entry operations in the * order in which they are issued in an in-memory queue. This queue is flushed * to disk during the commit operation. We also maintain a list of inodes * that need to be committed during a fast commit in another in memory queue of * inodes. During the commit operation, we commit in the following order: * * [1] Lock inodes for any further data updates by setting COMMITTING state * [2] Submit data buffers of all the inodes * [3] Wait for [2] to complete * [4] Commit all the directory entry updates in the fast commit space * [5] Commit all the changed inode structures * [6] Write tail tag (this tag ensures the atomicity, please read the following * section for more details). * [7] Wait for [4], [5] and [6] to complete. * * All the inode updates must call ext4_fc_start_update() before starting an * update. If such an ongoing update is present, fast commit waits for it to * complete. The completion of such an update is marked by * ext4_fc_stop_update(). * * Fast Commit Ineligibility * ------------------------- * * Not all operations are supported by fast commits today (e.g extended * attributes). Fast commit ineligibility is marked by calling * ext4_fc_mark_ineligible(): This makes next fast commit operation to fall back * to full commit. * * Atomicity of commits * -------------------- * In order to guarantee atomicity during the commit operation, fast commit * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail * tag contains CRC of the contents and TID of the transaction after which * this fast commit should be applied. Recovery code replays fast commit * logs only if there's at least 1 valid tail present. For every fast commit * operation, there is 1 tail. This means, we may end up with multiple tails * in the fast commit space. Here's an example: * * - Create a new file A and remove existing file B * - fsync() * - Append contents to file A * - Truncate file A * - fsync() * * The fast commit space at the end of above operations would look like this: * [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL] * |<--- Fast Commit 1 --->|<--- Fast Commit 2 ---->| * * Replay code should thus check for all the valid tails in the FC area. * * Fast Commit Replay Idempotence * ------------------------------ * * Fast commits tags are idempotent in nature provided the recovery code follows * certain rules. The guiding principle that the commit path follows while * committing is that it stores the result of a particular operation instead of * storing the procedure. * * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a' * was associated with inode 10. During fast commit, instead of storing this * operation as a procedure "rename a to b", we store the resulting file system * state as a "series" of outcomes: * * - Link dirent b to inode 10 * - Unlink dirent a * - Inode <10> with valid refcount * * Now when recovery code runs, it needs "enforce" this state on the file * system. This is what guarantees idempotence of fast commit replay. * * Let's take an example of a procedure that is not idempotent and see how fast * commits make it idempotent. Consider following sequence of operations: * * rm A; mv B A; read A * (x) (y) (z) * * (x), (y) and (z) are the points at which we can crash. If we store this * sequence of operations as is then the replay is not idempotent. Let's say * while in replay, we crash at (z). During the second replay, file A (which was * actually created as a result of "mv B A" operation) would get deleted. Thus, * file named A would be absent when we try to read A. So, this sequence of * operations is not idempotent. However, as mentioned above, instead of storing * the procedure fast commits store the outcome of each procedure. Thus the fast * commit log for above procedure would be as follows: * * (Let's assume dirent A was linked to inode 10 and dirent B was linked to * inode 11 before the replay) * * [Unlink A] [Link A to inode 11] [Unlink B] [Inode 11] * (w) (x) (y) (z) * * If we crash at (z), we will have file A linked to inode 11. During the second * replay, we will remove file A (inode 11). But we will create it back and make * it point to inode 11. We won't find B, so we'll just skip that step. At this * point, the refcount for inode 11 is not reliable, but that gets fixed by the * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled * similarly. Thus, by converting a non-idempotent procedure into a series of * idempotent outcomes, fast commits ensured idempotence during the replay. * * TODOs * ----- * * 0) Fast commit replay path hardening: Fast commit replay code should use * journal handles to make sure all the updates it does during the replay * path are atomic. With that if we crash during fast commit replay, after * trying to do recovery again, we will find a file system where fast commit * area is invalid (because new full commit would be found). In order to deal * with that, fast commit replay code should ensure that the "FC_REPLAY" * superblock state is persisted before starting the replay, so that after * the crash, fast commit recovery code can look at that flag and perform * fast commit recovery even if that area is invalidated by later full * commits. * * 1) Fast commit's commit path locks the entire file system during fast * commit. This has significant performance penalty. Instead of that, we * should use ext4_fc_start/stop_update functions to start inode level * updates from ext4_journal_start/stop. Once we do that we can drop file * system locking during commit path. * * 2) Handle more ineligible cases. */ #include <trace/events/ext4.h> static struct kmem_cache *ext4_fc_dentry_cachep; static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate) { BUFFER_TRACE(bh, ""); if (uptodate) { ext4_debug("%s: Block %lld up-to-date", __func__, bh->b_blocknr); set_buffer_uptodate(bh); } else { ext4_debug("%s: Block %lld not up-to-date", __func__, bh->b_blocknr); clear_buffer_uptodate(bh); } unlock_buffer(bh); } static inline void ext4_fc_reset_inode(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); ei->i_fc_lblk_start = 0; ei->i_fc_lblk_len = 0; } void ext4_fc_init_inode(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); ext4_fc_reset_inode(inode); ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING); INIT_LIST_HEAD(&ei->i_fc_list); INIT_LIST_HEAD(&ei->i_fc_dilist); init_waitqueue_head(&ei->i_fc_wait); atomic_set(&ei->i_fc_updates, 0); } /* This function must be called with sbi->s_fc_lock held. */ static void ext4_fc_wait_committing_inode(struct inode *inode) __releases(&EXT4_SB(inode->i_sb)->s_fc_lock) { wait_queue_head_t *wq; struct ext4_inode_info *ei = EXT4_I(inode); #if (BITS_PER_LONG < 64) DEFINE_WAIT_BIT(wait, &ei->i_state_flags, EXT4_STATE_FC_COMMITTING); wq = bit_waitqueue(&ei->i_state_flags, EXT4_STATE_FC_COMMITTING); #else DEFINE_WAIT_BIT(wait, &ei->i_flags, EXT4_STATE_FC_COMMITTING); wq = bit_waitqueue(&ei->i_flags, EXT4_STATE_FC_COMMITTING); #endif lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock); prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); schedule(); finish_wait(wq, &wait.wq_entry); } static bool ext4_fc_disabled(struct super_block *sb) { return (!test_opt2(sb, JOURNAL_FAST_COMMIT) || (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)); } /* * Inform Ext4's fast about start of an inode update * * This function is called by the high level call VFS callbacks before * performing any inode update. This function blocks if there's an ongoing * fast commit on the inode in question. */ void ext4_fc_start_update(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); if (ext4_fc_disabled(inode->i_sb)) return; restart: spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock); if (list_empty(&ei->i_fc_list)) goto out; if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { ext4_fc_wait_committing_inode(inode); goto restart; } out: atomic_inc(&ei->i_fc_updates); spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); } /* * Stop inode update and wake up waiting fast commits if any. */ void ext4_fc_stop_update(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); if (ext4_fc_disabled(inode->i_sb)) return; if (atomic_dec_and_test(&ei->i_fc_updates)) wake_up_all(&ei->i_fc_wait); } /* * Remove inode from fast commit list. If the inode is being committed * we wait until inode commit is done. */ void ext4_fc_del(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_fc_dentry_update *fc_dentry; if (ext4_fc_disabled(inode->i_sb)) return; restart: spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock); if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) { spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); return; } if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { ext4_fc_wait_committing_inode(inode); goto restart; } if (!list_empty(&ei->i_fc_list)) list_del_init(&ei->i_fc_list); /* * Since this inode is getting removed, let's also remove all FC * dentry create references, since it is not needed to log it anyways. */ if (list_empty(&ei->i_fc_dilist)) { spin_unlock(&sbi->s_fc_lock); return; } fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist); WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT); list_del_init(&fc_dentry->fcd_list); list_del_init(&fc_dentry->fcd_dilist); WARN_ON(!list_empty(&ei->i_fc_dilist)); spin_unlock(&sbi->s_fc_lock); if (fc_dentry->fcd_name.name && fc_dentry->fcd_name.len > DNAME_INLINE_LEN) kfree(fc_dentry->fcd_name.name); kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); return; } /* * Mark file system as fast commit ineligible, and record latest * ineligible transaction tid. This means until the recorded * transaction, commit operation would result in a full jbd2 commit. */ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle) { struct ext4_sb_info *sbi = EXT4_SB(sb); tid_t tid; if (ext4_fc_disabled(sb)) return; ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); if (handle && !IS_ERR(handle)) tid = handle->h_transaction->t_tid; else { read_lock(&sbi->s_journal->j_state_lock); tid = sbi->s_journal->j_running_transaction ? sbi->s_journal->j_running_transaction->t_tid : 0; read_unlock(&sbi->s_journal->j_state_lock); } spin_lock(&sbi->s_fc_lock); if (sbi->s_fc_ineligible_tid < tid) sbi->s_fc_ineligible_tid = tid; spin_unlock(&sbi->s_fc_lock); WARN_ON(reason >= EXT4_FC_REASON_MAX); sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; } /* * Generic fast commit tracking function. If this is the first time this we are * called after a full commit, we initialize fast commit fields and then call * __fc_track_fn() with update = 0. If we have already been called after a full * commit, we pass update = 1. Based on that, the track function can determine * if it needs to track a field for the first time or if it needs to just * update the previously tracked value. * * If enqueue is set, this function enqueues the inode in fast commit list. */ static int ext4_fc_track_template( handle_t *handle, struct inode *inode, int (*__fc_track_fn)(struct inode *, void *, bool), void *args, int enqueue) { bool update = false; struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); tid_t tid = 0; int ret; tid = handle->h_transaction->t_tid; mutex_lock(&ei->i_fc_lock); if (tid == ei->i_sync_tid) { update = true; } else { ext4_fc_reset_inode(inode); ei->i_sync_tid = tid; } ret = __fc_track_fn(inode, args, update); mutex_unlock(&ei->i_fc_lock); if (!enqueue) return ret; spin_lock(&sbi->s_fc_lock); if (list_empty(&EXT4_I(inode)->i_fc_list)) list_add_tail(&EXT4_I(inode)->i_fc_list, (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ? &sbi->s_fc_q[FC_Q_STAGING] : &sbi->s_fc_q[FC_Q_MAIN]); spin_unlock(&sbi->s_fc_lock); return ret; } struct __track_dentry_update_args { struct dentry *dentry; int op; }; /* __track_fn for directory entry updates. Called with ei->i_fc_lock. */ static int __track_dentry_update(struct inode *inode, void *arg, bool update) { struct ext4_fc_dentry_update *node; struct ext4_inode_info *ei = EXT4_I(inode); struct __track_dentry_update_args *dentry_update = (struct __track_dentry_update_args *)arg; struct dentry *dentry = dentry_update->dentry; struct inode *dir = dentry->d_parent->d_inode; struct super_block *sb = inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); mutex_unlock(&ei->i_fc_lock); if (IS_ENCRYPTED(dir)) { ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME, NULL); mutex_lock(&ei->i_fc_lock); return -EOPNOTSUPP; } node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS); if (!node) { ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, NULL); mutex_lock(&ei->i_fc_lock); return -ENOMEM; } node->fcd_op = dentry_update->op; node->fcd_parent = dir->i_ino; node->fcd_ino = inode->i_ino; if (dentry->d_name.len > DNAME_INLINE_LEN) { node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS); if (!node->fcd_name.name) { kmem_cache_free(ext4_fc_dentry_cachep, node); ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, NULL); mutex_lock(&ei->i_fc_lock); return -ENOMEM; } memcpy((u8 *)node->fcd_name.name, dentry->d_name.name, dentry->d_name.len); } else { memcpy(node->fcd_iname, dentry->d_name.name, dentry->d_name.len); node->fcd_name.name = node->fcd_iname; } node->fcd_name.len = dentry->d_name.len; INIT_LIST_HEAD(&node->fcd_dilist); spin_lock(&sbi->s_fc_lock); if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_STAGING]); else list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]); /* * This helps us keep a track of all fc_dentry updates which is part of * this ext4 inode. So in case the inode is getting unlinked, before * even we get a chance to fsync, we could remove all fc_dentry * references while evicting the inode in ext4_fc_del(). * Also with this, we don't need to loop over all the inodes in * sbi->s_fc_q to get the corresponding inode in * ext4_fc_commit_dentry_updates(). */ if (dentry_update->op == EXT4_FC_TAG_CREAT) { WARN_ON(!list_empty(&ei->i_fc_dilist)); list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist); } spin_unlock(&sbi->s_fc_lock); mutex_lock(&ei->i_fc_lock); return 0; } void __ext4_fc_track_unlink(handle_t *handle, struct inode *inode, struct dentry *dentry) { struct __track_dentry_update_args args; int ret; args.dentry = dentry; args.op = EXT4_FC_TAG_UNLINK; ret = ext4_fc_track_template(handle, inode, __track_dentry_update, (void *)&args, 0); trace_ext4_fc_track_unlink(handle, inode, dentry, ret); } void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry) { struct inode *inode = d_inode(dentry); if (ext4_fc_disabled(inode->i_sb)) return; if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; __ext4_fc_track_unlink(handle, inode, dentry); } void __ext4_fc_track_link(handle_t *handle, struct inode *inode, struct dentry *dentry) { struct __track_dentry_update_args args; int ret; args.dentry = dentry; args.op = EXT4_FC_TAG_LINK; ret = ext4_fc_track_template(handle, inode, __track_dentry_update, (void *)&args, 0); trace_ext4_fc_track_link(handle, inode, dentry, ret); } void ext4_fc_track_link(handle_t *handle, struct dentry *dentry) { struct inode *inode = d_inode(dentry); if (ext4_fc_disabled(inode->i_sb)) return; if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; __ext4_fc_track_link(handle, inode, dentry); } void __ext4_fc_track_create(handle_t *handle, struct inode *inode, struct dentry *dentry) { struct __track_dentry_update_args args; int ret; args.dentry = dentry; args.op = EXT4_FC_TAG_CREAT; ret = ext4_fc_track_template(handle, inode, __track_dentry_update, (void *)&args, 0); trace_ext4_fc_track_create(handle, inode, dentry, ret); } void ext4_fc_track_create(handle_t *handle, struct dentry *dentry) { struct inode *inode = d_inode(dentry); if (ext4_fc_disabled(inode->i_sb)) return; if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; __ext4_fc_track_create(handle, inode, dentry); } /* __track_fn for inode tracking */ static int __track_inode(struct inode *inode, void *arg, bool update) { if (update) return -EEXIST; EXT4_I(inode)->i_fc_lblk_len = 0; return 0; } void ext4_fc_track_inode(handle_t *handle, struct inode *inode) { int ret; if (S_ISDIR(inode->i_mode)) return; if (ext4_fc_disabled(inode->i_sb)) return; if (ext4_should_journal_data(inode)) { ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_INODE_JOURNAL_DATA, handle); return; } if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1); trace_ext4_fc_track_inode(handle, inode, ret); } struct __track_range_args { ext4_lblk_t start, end; }; /* __track_fn for tracking data updates */ static int __track_range(struct inode *inode, void *arg, bool update) { struct ext4_inode_info *ei = EXT4_I(inode); ext4_lblk_t oldstart; struct __track_range_args *__arg = (struct __track_range_args *)arg; if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) { ext4_debug("Special inode %ld being modified\n", inode->i_ino); return -ECANCELED; } oldstart = ei->i_fc_lblk_start; if (update && ei->i_fc_lblk_len > 0) { ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start); ei->i_fc_lblk_len = max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) - ei->i_fc_lblk_start + 1; } else { ei->i_fc_lblk_start = __arg->start; ei->i_fc_lblk_len = __arg->end - __arg->start + 1; } return 0; } void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start, ext4_lblk_t end) { struct __track_range_args args; int ret; if (S_ISDIR(inode->i_mode)) return; if (ext4_fc_disabled(inode->i_sb)) return; if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; args.start = start; args.end = end; ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1); trace_ext4_fc_track_range(handle, inode, start, end, ret); } static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail) { blk_opf_t write_flags = REQ_SYNC; struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh; /* Add REQ_FUA | REQ_PREFLUSH only its tail */ if (test_opt(sb, BARRIER) && is_tail) write_flags |= REQ_FUA | REQ_PREFLUSH; lock_buffer(bh); set_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = ext4_end_buffer_io_sync; submit_bh(REQ_OP_WRITE | write_flags, bh); EXT4_SB(sb)->s_fc_bh = NULL; } /* Ext4 commit path routines */ /* * Allocate len bytes on a fast commit buffer. * * During the commit time this function is used to manage fast commit * block space. We don't split a fast commit log onto different * blocks. So this function makes sure that if there's not enough space * on the current block, the remaining space in the current block is * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case, * new block is from jbd2 and CRC is updated to reflect the padding * we added. */ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) { struct ext4_fc_tl tl; struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bh; int bsize = sbi->s_journal->j_blocksize; int ret, off = sbi->s_fc_bytes % bsize; int remaining; u8 *dst; /* * If 'len' is too long to fit in any block alongside a PAD tlv, then we * cannot fulfill the request. */ if (len > bsize - EXT4_FC_TAG_BASE_LEN) return NULL; if (!sbi->s_fc_bh) { ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); if (ret) return NULL; sbi->s_fc_bh = bh; } dst = sbi->s_fc_bh->b_data + off; /* * Allocate the bytes in the current block if we can do so while still * leaving enough space for a PAD tlv. */ remaining = bsize - EXT4_FC_TAG_BASE_LEN - off; if (len <= remaining) { sbi->s_fc_bytes += len; return dst; } /* * Else, terminate the current block with a PAD tlv, then allocate a new * block and allocate the bytes at the start of that new block. */ tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD); tl.fc_len = cpu_to_le16(remaining); memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining); *crc = ext4_chksum(sbi, *crc, sbi->s_fc_bh->b_data, bsize); ext4_fc_submit_bh(sb, false); ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); if (ret) return NULL; sbi->s_fc_bh = bh; sbi->s_fc_bytes += bsize - off + len; return sbi->s_fc_bh->b_data; } /* * Complete a fast commit by writing tail tag. * * Writing tail tag marks the end of a fast commit. In order to guarantee * atomicity, after writing tail tag, even if there's space remaining * in the block, next commit shouldn't use it. That's why tail tag * has the length as that of the remaining space on the block. */ static int ext4_fc_write_tail(struct super_block *sb, u32 crc) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_fc_tl tl; struct ext4_fc_tail tail; int off, bsize = sbi->s_journal->j_blocksize; u8 *dst; /* * ext4_fc_reserve_space takes care of allocating an extra block if * there's no enough space on this block for accommodating this tail. */ dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(tail), &crc); if (!dst) return -ENOSPC; off = sbi->s_fc_bytes % bsize; tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL); tl.fc_len = cpu_to_le16(bsize - off + sizeof(struct ext4_fc_tail)); sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize); memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); dst += EXT4_FC_TAG_BASE_LEN; tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid); memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid)); dst += sizeof(tail.fc_tid); crc = ext4_chksum(sbi, crc, sbi->s_fc_bh->b_data, dst - (u8 *)sbi->s_fc_bh->b_data); tail.fc_crc = cpu_to_le32(crc); memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc)); dst += sizeof(tail.fc_crc); memset(dst, 0, bsize - off); /* Don't leak uninitialized memory. */ ext4_fc_submit_bh(sb, true); return 0; } /* * Adds tag, length, value and updates CRC. Returns true if tlv was added. * Returns false if there's not enough space. */ static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val, u32 *crc) { struct ext4_fc_tl tl; u8 *dst; dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + len, crc); if (!dst) return false; tl.fc_tag = cpu_to_le16(tag); tl.fc_len = cpu_to_le16(len); memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); memcpy(dst + EXT4_FC_TAG_BASE_LEN, val, len); return true; } /* Same as above, but adds dentry tlv. */ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc, struct ext4_fc_dentry_update *fc_dentry) { struct ext4_fc_dentry_info fcd; struct ext4_fc_tl tl; int dlen = fc_dentry->fcd_name.len; u8 *dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + dlen, crc); if (!dst) return false; fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent); fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino); tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op); tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen); memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); dst += EXT4_FC_TAG_BASE_LEN; memcpy(dst, &fcd, sizeof(fcd)); dst += sizeof(fcd); memcpy(dst, fc_dentry->fcd_name.name, dlen); return true; } /* * Writes inode in the fast commit space under TLV with tag @tag. * Returns 0 on success, error on failure. */ static int ext4_fc_write_inode(struct inode *inode, u32 *crc) { struct ext4_inode_info *ei = EXT4_I(inode); int inode_len = EXT4_GOOD_OLD_INODE_SIZE; int ret; struct ext4_iloc iloc; struct ext4_fc_inode fc_inode; struct ext4_fc_tl tl; u8 *dst; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) inode_len = EXT4_INODE_SIZE(inode->i_sb); else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) inode_len += ei->i_extra_isize; fc_inode.fc_ino = cpu_to_le32(inode->i_ino); tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE); tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino)); ret = -ECANCELED; dst = ext4_fc_reserve_space(inode->i_sb, EXT4_FC_TAG_BASE_LEN + inode_len + sizeof(fc_inode.fc_ino), crc); if (!dst) goto err; memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); dst += EXT4_FC_TAG_BASE_LEN; memcpy(dst, &fc_inode, sizeof(fc_inode)); dst += sizeof(fc_inode); memcpy(dst, (u8 *)ext4_raw_inode(&iloc), inode_len); ret = 0; err: brelse(iloc.bh); return ret; } /* * Writes updated data ranges for the inode in question. Updates CRC. * Returns 0 on success, error otherwise. */ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) { ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size; struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_map_blocks map; struct ext4_fc_add_range fc_ext; struct ext4_fc_del_range lrange; struct ext4_extent *ex; int ret; mutex_lock(&ei->i_fc_lock); if (ei->i_fc_lblk_len == 0) { mutex_unlock(&ei->i_fc_lock); return 0; } old_blk_size = ei->i_fc_lblk_start; new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1; ei->i_fc_lblk_len = 0; mutex_unlock(&ei->i_fc_lock); cur_lblk_off = old_blk_size; ext4_debug("will try writing %d to %d for inode %ld\n", cur_lblk_off, new_blk_size, inode->i_ino); while (cur_lblk_off <= new_blk_size) { map.m_lblk = cur_lblk_off; map.m_len = new_blk_size - cur_lblk_off + 1; ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) return -ECANCELED; if (map.m_len == 0) { cur_lblk_off++; continue; } if (ret == 0) { lrange.fc_ino = cpu_to_le32(inode->i_ino); lrange.fc_lblk = cpu_to_le32(map.m_lblk); lrange.fc_len = cpu_to_le32(map.m_len); if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE, sizeof(lrange), (u8 *)&lrange, crc)) return -ENOSPC; } else { unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ? EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN; /* Limit the number of blocks in one extent */ map.m_len = min(max, map.m_len); fc_ext.fc_ino = cpu_to_le32(inode->i_ino); ex = (struct ext4_extent *)&fc_ext.fc_ex; ex->ee_block = cpu_to_le32(map.m_lblk); ex->ee_len = cpu_to_le16(map.m_len); ext4_ext_store_pblock(ex, map.m_pblk); if (map.m_flags & EXT4_MAP_UNWRITTEN) ext4_ext_mark_unwritten(ex); else ext4_ext_mark_initialized(ex); if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE, sizeof(fc_ext), (u8 *)&fc_ext, crc)) return -ENOSPC; } cur_lblk_off += map.m_len; } return 0; } /* Submit data for all the fast commit inodes */ static int ext4_fc_submit_inode_data_all(journal_t *journal) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *ei; int ret = 0; spin_lock(&sbi->s_fc_lock); list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); while (atomic_read(&ei->i_fc_updates)) { DEFINE_WAIT(wait); prepare_to_wait(&ei->i_fc_wait, &wait, TASK_UNINTERRUPTIBLE); if (atomic_read(&ei->i_fc_updates)) { spin_unlock(&sbi->s_fc_lock); schedule(); spin_lock(&sbi->s_fc_lock); } finish_wait(&ei->i_fc_wait, &wait); } spin_unlock(&sbi->s_fc_lock); ret = jbd2_submit_inode_data(journal, ei->jinode); if (ret) return ret; spin_lock(&sbi->s_fc_lock); } spin_unlock(&sbi->s_fc_lock); return ret; } /* Wait for completion of data for all the fast commit inodes */ static int ext4_fc_wait_inode_data_all(journal_t *journal) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *pos, *n; int ret = 0; spin_lock(&sbi->s_fc_lock); list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { if (!ext4_test_inode_state(&pos->vfs_inode, EXT4_STATE_FC_COMMITTING)) continue; spin_unlock(&sbi->s_fc_lock); ret = jbd2_wait_inode_data(journal, pos->jinode); if (ret) return ret; spin_lock(&sbi->s_fc_lock); } spin_unlock(&sbi->s_fc_lock); return 0; } /* Commit all the directory entry updates */ static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc) __acquires(&sbi->s_fc_lock) __releases(&sbi->s_fc_lock) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n; struct inode *inode; struct ext4_inode_info *ei; int ret; if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) return 0; list_for_each_entry_safe(fc_dentry, fc_dentry_n, &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) { if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) { spin_unlock(&sbi->s_fc_lock); if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) { ret = -ENOSPC; goto lock_and_exit; } spin_lock(&sbi->s_fc_lock); continue; } /* * With fcd_dilist we need not loop in sbi->s_fc_q to get the * corresponding inode pointer */ WARN_ON(list_empty(&fc_dentry->fcd_dilist)); ei = list_first_entry(&fc_dentry->fcd_dilist, struct ext4_inode_info, i_fc_dilist); inode = &ei->vfs_inode; WARN_ON(inode->i_ino != fc_dentry->fcd_ino); spin_unlock(&sbi->s_fc_lock); /* * We first write the inode and then the create dirent. This * allows the recovery code to create an unnamed inode first * and then link it to a directory entry. This allows us * to use namei.c routines almost as is and simplifies * the recovery code. */ ret = ext4_fc_write_inode(inode, crc); if (ret) goto lock_and_exit; ret = ext4_fc_write_inode_data(inode, crc); if (ret) goto lock_and_exit; if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) { ret = -ENOSPC; goto lock_and_exit; } spin_lock(&sbi->s_fc_lock); } return 0; lock_and_exit: spin_lock(&sbi->s_fc_lock); return ret; } static int ext4_fc_perform_commit(journal_t *journal) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *iter; struct ext4_fc_head head; struct inode *inode; struct blk_plug plug; int ret = 0; u32 crc = 0; ret = ext4_fc_submit_inode_data_all(journal); if (ret) return ret; ret = ext4_fc_wait_inode_data_all(journal); if (ret) return ret; /* * If file system device is different from journal device, issue a cache * flush before we start writing fast commit blocks. */ if (journal->j_fs_dev != journal->j_dev) blkdev_issue_flush(journal->j_fs_dev); blk_start_plug(&plug); if (sbi->s_fc_bytes == 0) { /* * Add a head tag only if this is the first fast commit * in this TID. */ head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES); head.fc_tid = cpu_to_le32( sbi->s_journal->j_running_transaction->t_tid); if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head), (u8 *)&head, &crc)) { ret = -ENOSPC; goto out; } } spin_lock(&sbi->s_fc_lock); ret = ext4_fc_commit_dentry_updates(journal, &crc); if (ret) { spin_unlock(&sbi->s_fc_lock); goto out; } list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { inode = &iter->vfs_inode; if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) continue; spin_unlock(&sbi->s_fc_lock); ret = ext4_fc_write_inode_data(inode, &crc); if (ret) goto out; ret = ext4_fc_write_inode(inode, &crc); if (ret) goto out; spin_lock(&sbi->s_fc_lock); } spin_unlock(&sbi->s_fc_lock); ret = ext4_fc_write_tail(sb, crc); out: blk_finish_plug(&plug); return ret; } static void ext4_fc_update_stats(struct super_block *sb, int status, u64 commit_time, int nblks, tid_t commit_tid) { struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats; ext4_debug("Fast commit ended with status = %d for tid %u", status, commit_tid); if (status == EXT4_FC_STATUS_OK) { stats->fc_num_commits++; stats->fc_numblks += nblks; if (likely(stats->s_fc_avg_commit_time)) stats->s_fc_avg_commit_time = (commit_time + stats->s_fc_avg_commit_time * 3) / 4; else stats->s_fc_avg_commit_time = commit_time; } else if (status == EXT4_FC_STATUS_FAILED || status == EXT4_FC_STATUS_INELIGIBLE) { if (status == EXT4_FC_STATUS_FAILED) stats->fc_failed_commits++; stats->fc_ineligible_commits++; } else { stats->fc_skipped_commits++; } trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid); } /* * The main commit entry point. Performs a fast commit for transaction * commit_tid if needed. If it's not possible to perform a fast commit * due to various reasons, we fall back to full commit. Returns 0 * on success, error otherwise. */ int ext4_fc_commit(journal_t *journal, tid_t commit_tid) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); int nblks = 0, ret, bsize = journal->j_blocksize; int subtid = atomic_read(&sbi->s_fc_subtid); int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0; ktime_t start_time, commit_time; if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) return jbd2_complete_transaction(journal, commit_tid); trace_ext4_fc_commit_start(sb, commit_tid); start_time = ktime_get(); restart_fc: ret = jbd2_fc_begin_commit(journal, commit_tid); if (ret == -EALREADY) { /* There was an ongoing commit, check if we need to restart */ if (atomic_read(&sbi->s_fc_subtid) <= subtid && commit_tid > journal->j_commit_sequence) goto restart_fc; ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0, commit_tid); return 0; } else if (ret) { /* * Commit couldn't start. Just update stats and perform a * full commit. */ ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0, commit_tid); return jbd2_complete_transaction(journal, commit_tid); } /* * After establishing journal barrier via jbd2_fc_begin_commit(), check * if we are fast commit ineligible. */ if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) { status = EXT4_FC_STATUS_INELIGIBLE; goto fallback; } fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize; ret = ext4_fc_perform_commit(journal); if (ret < 0) { status = EXT4_FC_STATUS_FAILED; goto fallback; } nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before; ret = jbd2_fc_wait_bufs(journal, nblks); if (ret < 0) { status = EXT4_FC_STATUS_FAILED; goto fallback; } atomic_inc(&sbi->s_fc_subtid); ret = jbd2_fc_end_commit(journal); /* * weight the commit time higher than the average time so we * don't react too strongly to vast changes in the commit time */ commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid); return ret; fallback: ret = jbd2_fc_end_commit_fallback(journal); ext4_fc_update_stats(sb, status, 0, 0, commit_tid); return ret; } /* * Fast commit cleanup routine. This is called after every fast commit and * full commit. full is true if we are called after a full commit. */ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *iter, *iter_n; struct ext4_fc_dentry_update *fc_dentry; if (full && sbi->s_fc_bh) sbi->s_fc_bh = NULL; trace_ext4_fc_cleanup(journal, full, tid); jbd2_fc_release_bufs(journal); spin_lock(&sbi->s_fc_lock); list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { list_del_init(&iter->i_fc_list); ext4_clear_inode_state(&iter->vfs_inode, EXT4_STATE_FC_COMMITTING); if (iter->i_sync_tid <= tid) ext4_fc_reset_inode(&iter->vfs_inode); /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */ smp_mb(); #if (BITS_PER_LONG < 64) wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING); #else wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING); #endif } while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) { fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN], struct ext4_fc_dentry_update, fcd_list); list_del_init(&fc_dentry->fcd_list); list_del_init(&fc_dentry->fcd_dilist); spin_unlock(&sbi->s_fc_lock); if (fc_dentry->fcd_name.name && fc_dentry->fcd_name.len > DNAME_INLINE_LEN) kfree(fc_dentry->fcd_name.name); kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); spin_lock(&sbi->s_fc_lock); } list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING], &sbi->s_fc_dentry_q[FC_Q_MAIN]); list_splice_init(&sbi->s_fc_q[FC_Q_STAGING], &sbi->s_fc_q[FC_Q_MAIN]); if (tid >= sbi->s_fc_ineligible_tid) { sbi->s_fc_ineligible_tid = 0; ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); } if (full) sbi->s_fc_bytes = 0; spin_unlock(&sbi->s_fc_lock); trace_ext4_fc_stats(sb); } /* Ext4 Replay Path Routines */ /* Helper struct for dentry replay routines */ struct dentry_info_args { int parent_ino, dname_len, ino, inode_len; char *dname; }; /* Same as struct ext4_fc_tl, but uses native endianness fields */ struct ext4_fc_tl_mem { u16 fc_tag; u16 fc_len; }; static inline void tl_to_darg(struct dentry_info_args *darg, struct ext4_fc_tl_mem *tl, u8 *val) { struct ext4_fc_dentry_info fcd; memcpy(&fcd, val, sizeof(fcd)); darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino); darg->ino = le32_to_cpu(fcd.fc_ino); darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname); darg->dname_len = tl->fc_len - sizeof(struct ext4_fc_dentry_info); } static inline void ext4_fc_get_tl(struct ext4_fc_tl_mem *tl, u8 *val) { struct ext4_fc_tl tl_disk; memcpy(&tl_disk, val, EXT4_FC_TAG_BASE_LEN); tl->fc_len = le16_to_cpu(tl_disk.fc_len); tl->fc_tag = le16_to_cpu(tl_disk.fc_tag); } /* Unlink replay function */ static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl_mem *tl, u8 *val) { struct inode *inode, *old_parent; struct qstr entry; struct dentry_info_args darg; int ret = 0; tl_to_darg(&darg, tl, val); trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino, darg.parent_ino, darg.dname_len); entry.name = darg.dname; entry.len = darg.dname_len; inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("Inode %d not found", darg.ino); return 0; } old_parent = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL); if (IS_ERR(old_parent)) { ext4_debug("Dir with inode %d not found", darg.parent_ino); iput(inode); return 0; } ret = __ext4_unlink(old_parent, &entry, inode, NULL); /* -ENOENT ok coz it might not exist anymore. */ if (ret == -ENOENT) ret = 0; iput(old_parent); iput(inode); return ret; } static int ext4_fc_replay_link_internal(struct super_block *sb, struct dentry_info_args *darg, struct inode *inode) { struct inode *dir = NULL; struct dentry *dentry_dir = NULL, *dentry_inode = NULL; struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len); int ret = 0; dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL); if (IS_ERR(dir)) { ext4_debug("Dir with inode %d not found.", darg->parent_ino); dir = NULL; goto out; } dentry_dir = d_obtain_alias(dir); if (IS_ERR(dentry_dir)) { ext4_debug("Failed to obtain dentry"); dentry_dir = NULL; goto out; } dentry_inode = d_alloc(dentry_dir, &qstr_dname); if (!dentry_inode) { ext4_debug("Inode dentry not created."); ret = -ENOMEM; goto out; } ret = __ext4_link(dir, inode, dentry_inode); /* * It's possible that link already existed since data blocks * for the dir in question got persisted before we crashed OR * we replayed this tag and crashed before the entire replay * could complete. */ if (ret && ret != -EEXIST) { ext4_debug("Failed to link\n"); goto out; } ret = 0; out: if (dentry_dir) { d_drop(dentry_dir); dput(dentry_dir); } else if (dir) { iput(dir); } if (dentry_inode) { d_drop(dentry_inode); dput(dentry_inode); } return ret; } /* Link replay function */ static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl_mem *tl, u8 *val) { struct inode *inode; struct dentry_info_args darg; int ret = 0; tl_to_darg(&darg, tl, val); trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino, darg.parent_ino, darg.dname_len); inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("Inode not found."); return 0; } ret = ext4_fc_replay_link_internal(sb, &darg, inode); iput(inode); return ret; } /* * Record all the modified inodes during replay. We use this later to setup * block bitmaps correctly. */ static int ext4_fc_record_modified_inode(struct super_block *sb, int ino) { struct ext4_fc_replay_state *state; int i; state = &EXT4_SB(sb)->s_fc_replay_state; for (i = 0; i < state->fc_modified_inodes_used; i++) if (state->fc_modified_inodes[i] == ino) return 0; if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) { int *fc_modified_inodes; fc_modified_inodes = krealloc(state->fc_modified_inodes, sizeof(int) * (state->fc_modified_inodes_size + EXT4_FC_REPLAY_REALLOC_INCREMENT), GFP_KERNEL); if (!fc_modified_inodes) return -ENOMEM; state->fc_modified_inodes = fc_modified_inodes; state->fc_modified_inodes_size += EXT4_FC_REPLAY_REALLOC_INCREMENT; } state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino; return 0; } /* * Inode replay function */ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl_mem *tl, u8 *val) { struct ext4_fc_inode fc_inode; struct ext4_inode *raw_inode; struct ext4_inode *raw_fc_inode; struct inode *inode = NULL; struct ext4_iloc iloc; int inode_len, ino, ret, tag = tl->fc_tag; struct ext4_extent_header *eh; size_t off_gen = offsetof(struct ext4_inode, i_generation); memcpy(&fc_inode, val, sizeof(fc_inode)); ino = le32_to_cpu(fc_inode.fc_ino); trace_ext4_fc_replay(sb, tag, ino, 0, 0); inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); if (!IS_ERR(inode)) { ext4_ext_clear_bb(inode); iput(inode); } inode = NULL; ret = ext4_fc_record_modified_inode(sb, ino); if (ret) goto out; raw_fc_inode = (struct ext4_inode *) (val + offsetof(struct ext4_fc_inode, fc_raw_inode)); ret = ext4_get_fc_inode_loc(sb, ino, &iloc); if (ret) goto out; inode_len = tl->fc_len - sizeof(struct ext4_fc_inode); raw_inode = ext4_raw_inode(&iloc); memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block)); memcpy((u8 *)raw_inode + off_gen, (u8 *)raw_fc_inode + off_gen, inode_len - off_gen); if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) { eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]); if (eh->eh_magic != EXT4_EXT_MAGIC) { memset(eh, 0, sizeof(*eh)); eh->eh_magic = EXT4_EXT_MAGIC; eh->eh_max = cpu_to_le16( (sizeof(raw_inode->i_block) - sizeof(struct ext4_extent_header)) / sizeof(struct ext4_extent)); } } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) { memcpy(raw_inode->i_block, raw_fc_inode->i_block, sizeof(raw_inode->i_block)); } /* Immediately update the inode on disk. */ ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); if (ret) goto out; ret = sync_dirty_buffer(iloc.bh); if (ret) goto out; ret = ext4_mark_inode_used(sb, ino); if (ret) goto out; /* Given that we just wrote the inode on disk, this SHOULD succeed. */ inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("Inode not found."); return -EFSCORRUPTED; } /* * Our allocator could have made different decisions than before * crashing. This should be fixed but until then, we calculate * the number of blocks the inode. */ if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) ext4_ext_replay_set_iblocks(inode); inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation); ext4_reset_inode_seed(inode); ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode)); ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); sync_dirty_buffer(iloc.bh); brelse(iloc.bh); out: iput(inode); if (!ret) blkdev_issue_flush(sb->s_bdev); return 0; } /* * Dentry create replay function. * * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the * inode for which we are trying to create a dentry here, should already have * been replayed before we start here. */ static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl_mem *tl, u8 *val) { int ret = 0; struct inode *inode = NULL; struct inode *dir = NULL; struct dentry_info_args darg; tl_to_darg(&darg, tl, val); trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino, darg.parent_ino, darg.dname_len); /* This takes care of update group descriptor and other metadata */ ret = ext4_mark_inode_used(sb, darg.ino); if (ret) goto out; inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("inode %d not found.", darg.ino); inode = NULL; ret = -EINVAL; goto out; } if (S_ISDIR(inode->i_mode)) { /* * If we are creating a directory, we need to make sure that the * dot and dot dot dirents are setup properly. */ dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL); if (IS_ERR(dir)) { ext4_debug("Dir %d not found.", darg.ino); goto out; } ret = ext4_init_new_dir(NULL, dir, inode); iput(dir); if (ret) { ret = 0; goto out; } } ret = ext4_fc_replay_link_internal(sb, &darg, inode); if (ret) goto out; set_nlink(inode, 1); ext4_mark_inode_dirty(NULL, inode); out: iput(inode); return ret; } /* * Record physical disk regions which are in use as per fast commit area, * and used by inodes during replay phase. Our simple replay phase * allocator excludes these regions from allocation. */ int ext4_fc_record_regions(struct super_block *sb, int ino, ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay) { struct ext4_fc_replay_state *state; struct ext4_fc_alloc_region *region; state = &EXT4_SB(sb)->s_fc_replay_state; /* * during replay phase, the fc_regions_valid may not same as * fc_regions_used, update it when do new additions. */ if (replay && state->fc_regions_used != state->fc_regions_valid) state->fc_regions_used = state->fc_regions_valid; if (state->fc_regions_used == state->fc_regions_size) { struct ext4_fc_alloc_region *fc_regions; fc_regions = krealloc(state->fc_regions, sizeof(struct ext4_fc_alloc_region) * (state->fc_regions_size + EXT4_FC_REPLAY_REALLOC_INCREMENT), GFP_KERNEL); if (!fc_regions) return -ENOMEM; state->fc_regions_size += EXT4_FC_REPLAY_REALLOC_INCREMENT; state->fc_regions = fc_regions; } region = &state->fc_regions[state->fc_regions_used++]; region->ino = ino; region->lblk = lblk; region->pblk = pblk; region->len = len; if (replay) state->fc_regions_valid++; return 0; } /* Replay add range tag */ static int ext4_fc_replay_add_range(struct super_block *sb, struct ext4_fc_tl_mem *tl, u8 *val) { struct ext4_fc_add_range fc_add_ex; struct ext4_extent newex, *ex; struct inode *inode; ext4_lblk_t start, cur; int remaining, len; ext4_fsblk_t start_pblk; struct ext4_map_blocks map; struct ext4_ext_path *path = NULL; int ret; memcpy(&fc_add_ex, val, sizeof(fc_add_ex)); ex = (struct ext4_extent *)&fc_add_ex.fc_ex; trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE, le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block), ext4_ext_get_actual_len(ex)); inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("Inode not found."); return 0; } ret = ext4_fc_record_modified_inode(sb, inode->i_ino); if (ret) goto out; start = le32_to_cpu(ex->ee_block); start_pblk = ext4_ext_pblock(ex); len = ext4_ext_get_actual_len(ex); cur = start; remaining = len; ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n", start, start_pblk, len, ext4_ext_is_unwritten(ex), inode->i_ino); while (remaining > 0) { map.m_lblk = cur; map.m_len = remaining; map.m_pblk = 0; ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) goto out; if (ret == 0) { /* Range is not mapped */ path = ext4_find_extent(inode, cur, NULL, 0); if (IS_ERR(path)) goto out; memset(&newex, 0, sizeof(newex)); newex.ee_block = cpu_to_le32(cur); ext4_ext_store_pblock( &newex, start_pblk + cur - start); newex.ee_len = cpu_to_le16(map.m_len); if (ext4_ext_is_unwritten(ex)) ext4_ext_mark_unwritten(&newex); down_write(&EXT4_I(inode)->i_data_sem); ret = ext4_ext_insert_extent( NULL, inode, &path, &newex, 0); up_write((&EXT4_I(inode)->i_data_sem)); ext4_free_ext_path(path); if (ret) goto out; goto next; } if (start_pblk + cur - start != map.m_pblk) { /* * Logical to physical mapping changed. This can happen * if this range was removed and then reallocated to * map to new physical blocks during a fast commit. */ ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, ext4_ext_is_unwritten(ex), start_pblk + cur - start); if (ret) goto out; /* * Mark the old blocks as free since they aren't used * anymore. We maintain an array of all the modified * inodes. In case these blocks are still used at either * a different logical range in the same inode or in * some different inode, we will mark them as allocated * at the end of the FC replay using our array of * modified inodes. */ ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); goto next; } /* Range is mapped and needs a state change */ ext4_debug("Converting from %ld to %d %lld", map.m_flags & EXT4_MAP_UNWRITTEN, ext4_ext_is_unwritten(ex), map.m_pblk); ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, ext4_ext_is_unwritten(ex), map.m_pblk); if (ret) goto out; /* * We may have split the extent tree while toggling the state. * Try to shrink the extent tree now. */ ext4_ext_replay_shrink_inode(inode, start + len); next: cur += map.m_len; remaining -= map.m_len; } ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >> sb->s_blocksize_bits); out: iput(inode); return 0; } /* Replay DEL_RANGE tag */ static int ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl_mem *tl, u8 *val) { struct inode *inode; struct ext4_fc_del_range lrange; struct ext4_map_blocks map; ext4_lblk_t cur, remaining; int ret; memcpy(&lrange, val, sizeof(lrange)); cur = le32_to_cpu(lrange.fc_lblk); remaining = le32_to_cpu(lrange.fc_len); trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE, le32_to_cpu(lrange.fc_ino), cur, remaining); inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("Inode %d not found", le32_to_cpu(lrange.fc_ino)); return 0; } ret = ext4_fc_record_modified_inode(sb, inode->i_ino); if (ret) goto out; ext4_debug("DEL_RANGE, inode %ld, lblk %d, len %d\n", inode->i_ino, le32_to_cpu(lrange.fc_lblk), le32_to_cpu(lrange.fc_len)); while (remaining > 0) { map.m_lblk = cur; map.m_len = remaining; ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) goto out; if (ret > 0) { remaining -= ret; cur += ret; ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); } else { remaining -= map.m_len; cur += map.m_len; } } down_write(&EXT4_I(inode)->i_data_sem); ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk), le32_to_cpu(lrange.fc_lblk) + le32_to_cpu(lrange.fc_len) - 1); up_write(&EXT4_I(inode)->i_data_sem); if (ret) goto out; ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >> sb->s_blocksize_bits); ext4_mark_inode_dirty(NULL, inode); out: iput(inode); return 0; } static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb) { struct ext4_fc_replay_state *state; struct inode *inode; struct ext4_ext_path *path = NULL; struct ext4_map_blocks map; int i, ret, j; ext4_lblk_t cur, end; state = &EXT4_SB(sb)->s_fc_replay_state; for (i = 0; i < state->fc_modified_inodes_used; i++) { inode = ext4_iget(sb, state->fc_modified_inodes[i], EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("Inode %d not found.", state->fc_modified_inodes[i]); continue; } cur = 0; end = EXT_MAX_BLOCKS; if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) { iput(inode); continue; } while (cur < end) { map.m_lblk = cur; map.m_len = end - cur; ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) break; if (ret > 0) { path = ext4_find_extent(inode, map.m_lblk, NULL, 0); if (!IS_ERR(path)) { for (j = 0; j < path->p_depth; j++) ext4_mb_mark_bb(inode->i_sb, path[j].p_block, 1, true); ext4_free_ext_path(path); } cur += ret; ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, true); } else { cur = cur + (map.m_len ? map.m_len : 1); } } iput(inode); } } /* * Check if block is in excluded regions for block allocation. The simple * allocator that runs during replay phase is calls this function to see * if it is okay to use a block. */ bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk) { int i; struct ext4_fc_replay_state *state; state = &EXT4_SB(sb)->s_fc_replay_state; for (i = 0; i < state->fc_regions_valid; i++) { if (state->fc_regions[i].ino == 0 || state->fc_regions[i].len == 0) continue; if (in_range(blk, state->fc_regions[i].pblk, state->fc_regions[i].len)) return true; } return false; } /* Cleanup function called after replay */ void ext4_fc_replay_cleanup(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); sbi->s_mount_state &= ~EXT4_FC_REPLAY; kfree(sbi->s_fc_replay_state.fc_regions); kfree(sbi->s_fc_replay_state.fc_modified_inodes); } static bool ext4_fc_value_len_isvalid(struct ext4_sb_info *sbi, int tag, int len) { switch (tag) { case EXT4_FC_TAG_ADD_RANGE: return len == sizeof(struct ext4_fc_add_range); case EXT4_FC_TAG_DEL_RANGE: return len == sizeof(struct ext4_fc_del_range); case EXT4_FC_TAG_CREAT: case EXT4_FC_TAG_LINK: case EXT4_FC_TAG_UNLINK: len -= sizeof(struct ext4_fc_dentry_info); return len >= 1 && len <= EXT4_NAME_LEN; case EXT4_FC_TAG_INODE: len -= sizeof(struct ext4_fc_inode); return len >= EXT4_GOOD_OLD_INODE_SIZE && len <= sbi->s_inode_size; case EXT4_FC_TAG_PAD: return true; /* padding can have any length */ case EXT4_FC_TAG_TAIL: return len >= sizeof(struct ext4_fc_tail); case EXT4_FC_TAG_HEAD: return len == sizeof(struct ext4_fc_head); } return false; } /* * Recovery Scan phase handler * * This function is called during the scan phase and is responsible * for doing following things: * - Make sure the fast commit area has valid tags for replay * - Count number of tags that need to be replayed by the replay handler * - Verify CRC * - Create a list of excluded blocks for allocation during replay phase * * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP * to indicate that scan has finished and JBD2 can now start replay phase. * It returns a negative error to indicate that there was an error. At the end * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set * to indicate the number of tags that need to replayed during the replay phase. */ static int ext4_fc_replay_scan(journal_t *journal, struct buffer_head *bh, int off, tid_t expected_tid) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_fc_replay_state *state; int ret = JBD2_FC_REPLAY_CONTINUE; struct ext4_fc_add_range ext; struct ext4_fc_tl_mem tl; struct ext4_fc_tail tail; __u8 *start, *end, *cur, *val; struct ext4_fc_head head; struct ext4_extent *ex; state = &sbi->s_fc_replay_state; start = (u8 *)bh->b_data; end = start + journal->j_blocksize; if (state->fc_replay_expected_off == 0) { state->fc_cur_tag = 0; state->fc_replay_num_tags = 0; state->fc_crc = 0; state->fc_regions = NULL; state->fc_regions_valid = state->fc_regions_used = state->fc_regions_size = 0; /* Check if we can stop early */ if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag) != EXT4_FC_TAG_HEAD) return 0; } if (off != state->fc_replay_expected_off) { ret = -EFSCORRUPTED; goto out_err; } state->fc_replay_expected_off++; for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN; cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) { ext4_fc_get_tl(&tl, cur); val = cur + EXT4_FC_TAG_BASE_LEN; if (tl.fc_len > end - val || !ext4_fc_value_len_isvalid(sbi, tl.fc_tag, tl.fc_len)) { ret = state->fc_replay_num_tags ? JBD2_FC_REPLAY_STOP : -ECANCELED; goto out_err; } ext4_debug("Scan phase, tag:%s, blk %lld\n", tag2str(tl.fc_tag), bh->b_blocknr); switch (tl.fc_tag) { case EXT4_FC_TAG_ADD_RANGE: memcpy(&ext, val, sizeof(ext)); ex = (struct ext4_extent *)&ext.fc_ex; ret = ext4_fc_record_regions(sb, le32_to_cpu(ext.fc_ino), le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex), ext4_ext_get_actual_len(ex), 0); if (ret < 0) break; ret = JBD2_FC_REPLAY_CONTINUE; fallthrough; case EXT4_FC_TAG_DEL_RANGE: case EXT4_FC_TAG_LINK: case EXT4_FC_TAG_UNLINK: case EXT4_FC_TAG_CREAT: case EXT4_FC_TAG_INODE: case EXT4_FC_TAG_PAD: state->fc_cur_tag++; state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, EXT4_FC_TAG_BASE_LEN + tl.fc_len); break; case EXT4_FC_TAG_TAIL: state->fc_cur_tag++; memcpy(&tail, val, sizeof(tail)); state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, EXT4_FC_TAG_BASE_LEN + offsetof(struct ext4_fc_tail, fc_crc)); if (le32_to_cpu(tail.fc_tid) == expected_tid && le32_to_cpu(tail.fc_crc) == state->fc_crc) { state->fc_replay_num_tags = state->fc_cur_tag; state->fc_regions_valid = state->fc_regions_used; } else { ret = state->fc_replay_num_tags ? JBD2_FC_REPLAY_STOP : -EFSBADCRC; } state->fc_crc = 0; break; case EXT4_FC_TAG_HEAD: memcpy(&head, val, sizeof(head)); if (le32_to_cpu(head.fc_features) & ~EXT4_FC_SUPPORTED_FEATURES) { ret = -EOPNOTSUPP; break; } if (le32_to_cpu(head.fc_tid) != expected_tid) { ret = JBD2_FC_REPLAY_STOP; break; } state->fc_cur_tag++; state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, EXT4_FC_TAG_BASE_LEN + tl.fc_len); break; default: ret = state->fc_replay_num_tags ? JBD2_FC_REPLAY_STOP : -ECANCELED; } if (ret < 0 || ret == JBD2_FC_REPLAY_STOP) break; } out_err: trace_ext4_fc_replay_scan(sb, ret, off); return ret; } /* * Main recovery path entry point. * The meaning of return codes is similar as above. */ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, enum passtype pass, int off, tid_t expected_tid) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_fc_tl_mem tl; __u8 *start, *end, *cur, *val; int ret = JBD2_FC_REPLAY_CONTINUE; struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state; struct ext4_fc_tail tail; if (pass == PASS_SCAN) { state->fc_current_pass = PASS_SCAN; return ext4_fc_replay_scan(journal, bh, off, expected_tid); } if (state->fc_current_pass != pass) { state->fc_current_pass = pass; sbi->s_mount_state |= EXT4_FC_REPLAY; } if (!sbi->s_fc_replay_state.fc_replay_num_tags) { ext4_debug("Replay stops\n"); ext4_fc_set_bitmaps_and_counters(sb); return 0; } #ifdef CONFIG_EXT4_DEBUG if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) { pr_warn("Dropping fc block %d because max_replay set\n", off); return JBD2_FC_REPLAY_STOP; } #endif start = (u8 *)bh->b_data; end = start + journal->j_blocksize; for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN; cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) { ext4_fc_get_tl(&tl, cur); val = cur + EXT4_FC_TAG_BASE_LEN; if (state->fc_replay_num_tags == 0) { ret = JBD2_FC_REPLAY_STOP; ext4_fc_set_bitmaps_and_counters(sb); break; } ext4_debug("Replay phase, tag:%s\n", tag2str(tl.fc_tag)); state->fc_replay_num_tags--; switch (tl.fc_tag) { case EXT4_FC_TAG_LINK: ret = ext4_fc_replay_link(sb, &tl, val); break; case EXT4_FC_TAG_UNLINK: ret = ext4_fc_replay_unlink(sb, &tl, val); break; case EXT4_FC_TAG_ADD_RANGE: ret = ext4_fc_replay_add_range(sb, &tl, val); break; case EXT4_FC_TAG_CREAT: ret = ext4_fc_replay_create(sb, &tl, val); break; case EXT4_FC_TAG_DEL_RANGE: ret = ext4_fc_replay_del_range(sb, &tl, val); break; case EXT4_FC_TAG_INODE: ret = ext4_fc_replay_inode(sb, &tl, val); break; case EXT4_FC_TAG_PAD: trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0, tl.fc_len, 0); break; case EXT4_FC_TAG_TAIL: trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0, tl.fc_len, 0); memcpy(&tail, val, sizeof(tail)); WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid); break; case EXT4_FC_TAG_HEAD: break; default: trace_ext4_fc_replay(sb, tl.fc_tag, 0, tl.fc_len, 0); ret = -ECANCELED; break; } if (ret < 0) break; ret = JBD2_FC_REPLAY_CONTINUE; } return ret; } void ext4_fc_init(struct super_block *sb, journal_t *journal) { /* * We set replay callback even if fast commit disabled because we may * could still have fast commit blocks that need to be replayed even if * fast commit has now been turned off. */ journal->j_fc_replay_callback = ext4_fc_replay; if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) return; journal->j_fc_cleanup_callback = ext4_fc_cleanup; } static const char * const fc_ineligible_reasons[] = { [EXT4_FC_REASON_XATTR] = "Extended attributes changed", [EXT4_FC_REASON_CROSS_RENAME] = "Cross rename", [EXT4_FC_REASON_JOURNAL_FLAG_CHANGE] = "Journal flag changed", [EXT4_FC_REASON_NOMEM] = "Insufficient memory", [EXT4_FC_REASON_SWAP_BOOT] = "Swap boot", [EXT4_FC_REASON_RESIZE] = "Resize", [EXT4_FC_REASON_RENAME_DIR] = "Dir renamed", [EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op", [EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling", [EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename", }; int ext4_fc_info_show(struct seq_file *seq, void *v) { struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private); struct ext4_fc_stats *stats = &sbi->s_fc_stats; int i; if (v != SEQ_START_TOKEN) return 0; seq_printf(seq, "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n", stats->fc_num_commits, stats->fc_ineligible_commits, stats->fc_numblks, div_u64(stats->s_fc_avg_commit_time, 1000)); seq_puts(seq, "Ineligible reasons:\n"); for (i = 0; i < EXT4_FC_REASON_MAX; i++) seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i], stats->fc_ineligible_reason_count[i]); return 0; } int __init ext4_fc_init_dentry_cache(void) { ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update, SLAB_RECLAIM_ACCOUNT); if (ext4_fc_dentry_cachep == NULL) return -ENOMEM; return 0; } void ext4_fc_destroy_dentry_cache(void) { kmem_cache_destroy(ext4_fc_dentry_cachep); }
1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 // SPDX-License-Identifier: GPL-2.0-or-later /* Instantiate a public key crypto key from an X.509 Certificate * * Copyright (C) 2012, 2016 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) "ASYM: "fmt #include <linux/module.h> #include <linux/kernel.h> #include <linux/err.h> #include <crypto/public_key.h> #include "asymmetric_keys.h" static bool use_builtin_keys; static struct asymmetric_key_id *ca_keyid; #ifndef MODULE static struct { struct asymmetric_key_id id; unsigned char data[10]; } cakey; static int __init ca_keys_setup(char *str) { if (!str) /* default system keyring */ return 1; if (strncmp(str, "id:", 3) == 0) { struct asymmetric_key_id *p = &cakey.id; size_t hexlen = (strlen(str) - 3) / 2; int ret; if (hexlen == 0 || hexlen > sizeof(cakey.data)) { pr_err("Missing or invalid ca_keys id\n"); return 1; } ret = __asymmetric_key_hex_to_key_id(str + 3, p, hexlen); if (ret < 0) pr_err("Unparsable ca_keys id hex string\n"); else ca_keyid = p; /* owner key 'id:xxxxxx' */ } else if (strcmp(str, "builtin") == 0) { use_builtin_keys = true; } return 1; } __setup("ca_keys=", ca_keys_setup); #endif /** * restrict_link_by_signature - Restrict additions to a ring of public keys * @dest_keyring: Keyring being linked to. * @type: The type of key being added. * @payload: The payload of the new key. * @trust_keyring: A ring of keys that can be used to vouch for the new cert. * * Check the new certificate against the ones in the trust keyring. If one of * those is the signing key and validates the new certificate, then mark the * new certificate as being trusted. * * Returns 0 if the new certificate was accepted, -ENOKEY if we couldn't find a * matching parent certificate in the trusted list, -EKEYREJECTED if the * signature check fails or the key is blacklisted, -ENOPKG if the signature * uses unsupported crypto, or some other error if there is a matching * certificate but the signature check cannot be performed. */ int restrict_link_by_signature(struct key *dest_keyring, const struct key_type *type, const union key_payload *payload, struct key *trust_keyring) { const struct public_key_signature *sig; struct key *key; int ret; pr_devel("==>%s()\n", __func__); if (!trust_keyring) return -ENOKEY; if (type != &key_type_asymmetric) return -EOPNOTSUPP; sig = payload->data[asym_auth]; if (!sig) return -ENOPKG; if (!sig->auth_ids[0] && !sig->auth_ids[1] && !sig->auth_ids[2]) return -ENOKEY; if (ca_keyid && !asymmetric_key_id_partial(sig->auth_ids[1], ca_keyid)) return -EPERM; /* See if we have a key that signed this one. */ key = find_asymmetric_key(trust_keyring, sig->auth_ids[0], sig->auth_ids[1], sig->auth_ids[2], false); if (IS_ERR(key)) return -ENOKEY; if (use_builtin_keys && !test_bit(KEY_FLAG_BUILTIN, &key->flags)) ret = -ENOKEY; else if (IS_BUILTIN(CONFIG_SECONDARY_TRUSTED_KEYRING_SIGNED_BY_BUILTIN) && !strcmp(dest_keyring->description, ".secondary_trusted_keys") && !test_bit(KEY_FLAG_BUILTIN, &key->flags)) ret = -ENOKEY; else ret = verify_signature(key, sig); key_put(key); return ret; } /** * restrict_link_by_ca - Restrict additions to a ring of CA keys * @dest_keyring: Keyring being linked to. * @type: The type of key being added. * @payload: The payload of the new key. * @trust_keyring: Unused. * * Check if the new certificate is a CA. If it is a CA, then mark the new * certificate as being ok to link. * * Returns 0 if the new certificate was accepted, -ENOKEY if the * certificate is not a CA. -ENOPKG if the signature uses unsupported * crypto, or some other error if there is a matching certificate but * the signature check cannot be performed. */ int restrict_link_by_ca(struct key *dest_keyring, const struct key_type *type, const union key_payload *payload, struct key *trust_keyring) { const struct public_key *pkey; if (type != &key_type_asymmetric) return -EOPNOTSUPP; pkey = payload->data[asym_crypto]; if (!pkey) return -ENOPKG; if (!test_bit(KEY_EFLAG_CA, &pkey->key_eflags)) return -ENOKEY; if (!test_bit(KEY_EFLAG_KEYCERTSIGN, &pkey->key_eflags)) return -ENOKEY; if (!IS_ENABLED(CONFIG_INTEGRITY_CA_MACHINE_KEYRING_MAX)) return 0; if (test_bit(KEY_EFLAG_DIGITALSIG, &pkey->key_eflags)) return -ENOKEY; return 0; } /** * restrict_link_by_digsig - Restrict additions to a ring of digsig keys * @dest_keyring: Keyring being linked to. * @type: The type of key being added. * @payload: The payload of the new key. * @trust_keyring: A ring of keys that can be used to vouch for the new cert. * * Check if the new certificate has digitalSignature usage set. If it is, * then mark the new certificate as being ok to link. Afterwards verify * the new certificate against the ones in the trust_keyring. * * Returns 0 if the new certificate was accepted, -ENOKEY if the * certificate is not a digsig. -ENOPKG if the signature uses unsupported * crypto, or some other error if there is a matching certificate but * the signature check cannot be performed. */ int restrict_link_by_digsig(struct key *dest_keyring, const struct key_type *type, const union key_payload *payload, struct key *trust_keyring) { const struct public_key *pkey; if (type != &key_type_asymmetric) return -EOPNOTSUPP; pkey = payload->data[asym_crypto]; if (!pkey) return -ENOPKG; if (!test_bit(KEY_EFLAG_DIGITALSIG, &pkey->key_eflags)) return -ENOKEY; if (test_bit(KEY_EFLAG_CA, &pkey->key_eflags)) return -ENOKEY; if (test_bit(KEY_EFLAG_KEYCERTSIGN, &pkey->key_eflags)) return -ENOKEY; return restrict_link_by_signature(dest_keyring, type, payload, trust_keyring); } static bool match_either_id(const struct asymmetric_key_id **pair, const struct asymmetric_key_id *single) { return (asymmetric_key_id_same(pair[0], single) || asymmetric_key_id_same(pair[1], single)); } static int key_or_keyring_common(struct key *dest_keyring, const struct key_type *type, const union key_payload *payload, struct key *trusted, bool check_dest) { const struct public_key_signature *sig; struct key *key = NULL; int ret; pr_devel("==>%s()\n", __func__); if (!dest_keyring) return -ENOKEY; else if (dest_keyring->type != &key_type_keyring) return -EOPNOTSUPP; if (!trusted && !check_dest) return -ENOKEY; if (type != &key_type_asymmetric) return -EOPNOTSUPP; sig = payload->data[asym_auth]; if (!sig) return -ENOPKG; if (!sig->auth_ids[0] && !sig->auth_ids[1] && !sig->auth_ids[2]) return -ENOKEY; if (trusted) { if (trusted->type == &key_type_keyring) { /* See if we have a key that signed this one. */ key = find_asymmetric_key(trusted, sig->auth_ids[0], sig->auth_ids[1], sig->auth_ids[2], false); if (IS_ERR(key)) key = NULL; } else if (trusted->type == &key_type_asymmetric) { const struct asymmetric_key_id **signer_ids; signer_ids = (const struct asymmetric_key_id **) asymmetric_key_ids(trusted)->id; /* * The auth_ids come from the candidate key (the * one that is being considered for addition to * dest_keyring) and identify the key that was * used to sign. * * The signer_ids are identifiers for the * signing key specified for dest_keyring. * * The first auth_id is the preferred id, 2nd and * 3rd are the fallbacks. If exactly one of * auth_ids[0] and auth_ids[1] is present, it may * match either signer_ids[0] or signed_ids[1]. * If both are present the first one may match * either signed_id but the second one must match * the second signer_id. If neither of them is * available, auth_ids[2] is matched against * signer_ids[2] as a fallback. */ if (!sig->auth_ids[0] && !sig->auth_ids[1]) { if (asymmetric_key_id_same(signer_ids[2], sig->auth_ids[2])) key = __key_get(trusted); } else if (!sig->auth_ids[0] || !sig->auth_ids[1]) { const struct asymmetric_key_id *auth_id; auth_id = sig->auth_ids[0] ?: sig->auth_ids[1]; if (match_either_id(signer_ids, auth_id)) key = __key_get(trusted); } else if (asymmetric_key_id_same(signer_ids[1], sig->auth_ids[1]) && match_either_id(signer_ids, sig->auth_ids[0])) { key = __key_get(trusted); } } else { return -EOPNOTSUPP; } } if (check_dest && !key) { /* See if the destination has a key that signed this one. */ key = find_asymmetric_key(dest_keyring, sig->auth_ids[0], sig->auth_ids[1], sig->auth_ids[2], false); if (IS_ERR(key)) key = NULL; } if (!key) return -ENOKEY; ret = key_validate(key); if (ret == 0) ret = verify_signature(key, sig); key_put(key); return ret; } /** * restrict_link_by_key_or_keyring - Restrict additions to a ring of public * keys using the restrict_key information stored in the ring. * @dest_keyring: Keyring being linked to. * @type: The type of key being added. * @payload: The payload of the new key. * @trusted: A key or ring of keys that can be used to vouch for the new cert. * * Check the new certificate only against the key or keys passed in the data * parameter. If one of those is the signing key and validates the new * certificate, then mark the new certificate as being ok to link. * * Returns 0 if the new certificate was accepted, -ENOKEY if we * couldn't find a matching parent certificate in the trusted list, * -EKEYREJECTED if the signature check fails, -ENOPKG if the signature uses * unsupported crypto, or some other error if there is a matching certificate * but the signature check cannot be performed. */ int restrict_link_by_key_or_keyring(struct key *dest_keyring, const struct key_type *type, const union key_payload *payload, struct key *trusted) { return key_or_keyring_common(dest_keyring, type, payload, trusted, false); } /** * restrict_link_by_key_or_keyring_chain - Restrict additions to a ring of * public keys using the restrict_key information stored in the ring. * @dest_keyring: Keyring being linked to. * @type: The type of key being added. * @payload: The payload of the new key. * @trusted: A key or ring of keys that can be used to vouch for the new cert. * * Check the new certificate against the key or keys passed in the data * parameter and against the keys already linked to the destination keyring. If * one of those is the signing key and validates the new certificate, then mark * the new certificate as being ok to link. * * Returns 0 if the new certificate was accepted, -ENOKEY if we * couldn't find a matching parent certificate in the trusted list, * -EKEYREJECTED if the signature check fails, -ENOPKG if the signature uses * unsupported crypto, or some other error if there is a matching certificate * but the signature check cannot be performed. */ int restrict_link_by_key_or_keyring_chain(struct key *dest_keyring, const struct key_type *type, const union key_payload *payload, struct key *trusted) { return key_or_keyring_common(dest_keyring, type, payload, trusted, true); }
2 3 3 2 2 3 2 2 5 5 5 5 5 3 3 3 3 3 2 3 3 15 16 16 12 12 8 5 2 2 2 2 2 1 1 2 5 1 1 1 1 1 5 5 5 5 14 14 15 15 3 3 15 15 15 3 15 2 14 2 15 1 15 15 3 2 2 1 1 2 14 14 2 14 2 14 2 14 1 14 1 13 1 1 1 1 11 10 2 13 2 2 1 2 2 15 1 1 1 1 2 2 1 1 1 1 1 1 3 3 3 3 3 3 1 1 1 1 1 4 4 4 4 4 4 4 1 3 4 1 4 4 16 16 4 12 4 16 16 6 10 16 16 1 16 16 16 16 16 15 4 11 16 16 16 9 2 10 3 2 1 1 1 1 1 8 8 8 11 11 11 11 9 9 9 9 9 8 8 1 8 8 1 1 4 11 3 2 2 1 1 2 2 2 2 3 3 4 4 4 4 4 4 3 3 3 2 2 2 1 1 1 19 6 13 7 6 6 2 1 3 2 1 1 7 5 5 1 4 4 1 3 3 4 3 2 4 4 4 4 4 4 2 2 3 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */ #include <linux/bpf.h> #include <linux/btf_ids.h> #include <linux/filter.h> #include <linux/errno.h> #include <linux/file.h> #include <linux/net.h> #include <linux/workqueue.h> #include <linux/skmsg.h> #include <linux/list.h> #include <linux/jhash.h> #include <linux/sock_diag.h> #include <net/udp.h> struct bpf_stab { struct bpf_map map; struct sock **sks; struct sk_psock_progs progs; spinlock_t lock; }; #define SOCK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) /* This mutex is used to * - protect race between prog/link attach/detach and link prog update, and * - protect race between releasing and accessing map in bpf_link. * A single global mutex lock is used since it is expected contention is low. */ static DEFINE_MUTEX(sockmap_mutex); static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, struct bpf_prog *old, struct bpf_link *link, u32 which); static struct sk_psock_progs *sock_map_progs(struct bpf_map *map); static struct bpf_map *sock_map_alloc(union bpf_attr *attr) { struct bpf_stab *stab; if (attr->max_entries == 0 || attr->key_size != 4 || (attr->value_size != sizeof(u32) && attr->value_size != sizeof(u64)) || attr->map_flags & ~SOCK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); stab = bpf_map_area_alloc(sizeof(*stab), NUMA_NO_NODE); if (!stab) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&stab->map, attr); spin_lock_init(&stab->lock); stab->sks = bpf_map_area_alloc((u64) stab->map.max_entries * sizeof(struct sock *), stab->map.numa_node); if (!stab->sks) { bpf_map_area_free(stab); return ERR_PTR(-ENOMEM); } return &stab->map; } int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) { u32 ufd = attr->target_fd; struct bpf_map *map; struct fd f; int ret; if (attr->attach_flags || attr->replace_bpf_fd) return -EINVAL; f = fdget(ufd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); mutex_lock(&sockmap_mutex); ret = sock_map_prog_update(map, prog, NULL, NULL, attr->attach_type); mutex_unlock(&sockmap_mutex); fdput(f); return ret; } int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) { u32 ufd = attr->target_fd; struct bpf_prog *prog; struct bpf_map *map; struct fd f; int ret; if (attr->attach_flags || attr->replace_bpf_fd) return -EINVAL; f = fdget(ufd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); prog = bpf_prog_get(attr->attach_bpf_fd); if (IS_ERR(prog)) { ret = PTR_ERR(prog); goto put_map; } if (prog->type != ptype) { ret = -EINVAL; goto put_prog; } mutex_lock(&sockmap_mutex); ret = sock_map_prog_update(map, NULL, prog, NULL, attr->attach_type); mutex_unlock(&sockmap_mutex); put_prog: bpf_prog_put(prog); put_map: fdput(f); return ret; } static void sock_map_sk_acquire(struct sock *sk) __acquires(&sk->sk_lock.slock) { lock_sock(sk); rcu_read_lock(); } static void sock_map_sk_release(struct sock *sk) __releases(&sk->sk_lock.slock) { rcu_read_unlock(); release_sock(sk); } static void sock_map_add_link(struct sk_psock *psock, struct sk_psock_link *link, struct bpf_map *map, void *link_raw) { link->link_raw = link_raw; link->map = map; spin_lock_bh(&psock->link_lock); list_add_tail(&link->list, &psock->link); spin_unlock_bh(&psock->link_lock); } static void sock_map_del_link(struct sock *sk, struct sk_psock *psock, void *link_raw) { bool strp_stop = false, verdict_stop = false; struct sk_psock_link *link, *tmp; spin_lock_bh(&psock->link_lock); list_for_each_entry_safe(link, tmp, &psock->link, list) { if (link->link_raw == link_raw) { struct bpf_map *map = link->map; struct sk_psock_progs *progs = sock_map_progs(map); if (psock->saved_data_ready && progs->stream_parser) strp_stop = true; if (psock->saved_data_ready && progs->stream_verdict) verdict_stop = true; if (psock->saved_data_ready && progs->skb_verdict) verdict_stop = true; list_del(&link->list); sk_psock_free_link(link); } } spin_unlock_bh(&psock->link_lock); if (strp_stop || verdict_stop) { write_lock_bh(&sk->sk_callback_lock); if (strp_stop) sk_psock_stop_strp(sk, psock); if (verdict_stop) sk_psock_stop_verdict(sk, psock); if (psock->psock_update_sk_prot) psock->psock_update_sk_prot(sk, psock, false); write_unlock_bh(&sk->sk_callback_lock); } } static void sock_map_unref(struct sock *sk, void *link_raw) { struct sk_psock *psock = sk_psock(sk); if (likely(psock)) { sock_map_del_link(sk, psock, link_raw); sk_psock_put(sk, psock); } } static int sock_map_init_proto(struct sock *sk, struct sk_psock *psock) { if (!sk->sk_prot->psock_update_sk_prot) return -EINVAL; psock->psock_update_sk_prot = sk->sk_prot->psock_update_sk_prot; return sk->sk_prot->psock_update_sk_prot(sk, psock, false); } static struct sk_psock *sock_map_psock_get_checked(struct sock *sk) { struct sk_psock *psock; rcu_read_lock(); psock = sk_psock(sk); if (psock) { if (sk->sk_prot->close != sock_map_close) { psock = ERR_PTR(-EBUSY); goto out; } if (!refcount_inc_not_zero(&psock->refcnt)) psock = ERR_PTR(-EBUSY); } out: rcu_read_unlock(); return psock; } static int sock_map_link(struct bpf_map *map, struct sock *sk) { struct sk_psock_progs *progs = sock_map_progs(map); struct bpf_prog *stream_verdict = NULL; struct bpf_prog *stream_parser = NULL; struct bpf_prog *skb_verdict = NULL; struct bpf_prog *msg_parser = NULL; struct sk_psock *psock; int ret; stream_verdict = READ_ONCE(progs->stream_verdict); if (stream_verdict) { stream_verdict = bpf_prog_inc_not_zero(stream_verdict); if (IS_ERR(stream_verdict)) return PTR_ERR(stream_verdict); } stream_parser = READ_ONCE(progs->stream_parser); if (stream_parser) { stream_parser = bpf_prog_inc_not_zero(stream_parser); if (IS_ERR(stream_parser)) { ret = PTR_ERR(stream_parser); goto out_put_stream_verdict; } } msg_parser = READ_ONCE(progs->msg_parser); if (msg_parser) { msg_parser = bpf_prog_inc_not_zero(msg_parser); if (IS_ERR(msg_parser)) { ret = PTR_ERR(msg_parser); goto out_put_stream_parser; } } skb_verdict = READ_ONCE(progs->skb_verdict); if (skb_verdict) { skb_verdict = bpf_prog_inc_not_zero(skb_verdict); if (IS_ERR(skb_verdict)) { ret = PTR_ERR(skb_verdict); goto out_put_msg_parser; } } psock = sock_map_psock_get_checked(sk); if (IS_ERR(psock)) { ret = PTR_ERR(psock); goto out_progs; } if (psock) { if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) || (stream_parser && READ_ONCE(psock->progs.stream_parser)) || (skb_verdict && READ_ONCE(psock->progs.skb_verdict)) || (skb_verdict && READ_ONCE(psock->progs.stream_verdict)) || (stream_verdict && READ_ONCE(psock->progs.skb_verdict)) || (stream_verdict && READ_ONCE(psock->progs.stream_verdict))) { sk_psock_put(sk, psock); ret = -EBUSY; goto out_progs; } } else { psock = sk_psock_init(sk, map->numa_node); if (IS_ERR(psock)) { ret = PTR_ERR(psock); goto out_progs; } } if (msg_parser) psock_set_prog(&psock->progs.msg_parser, msg_parser); if (stream_parser) psock_set_prog(&psock->progs.stream_parser, stream_parser); if (stream_verdict) psock_set_prog(&psock->progs.stream_verdict, stream_verdict); if (skb_verdict) psock_set_prog(&psock->progs.skb_verdict, skb_verdict); /* msg_* and stream_* programs references tracked in psock after this * point. Reference dec and cleanup will occur through psock destructor */ ret = sock_map_init_proto(sk, psock); if (ret < 0) { sk_psock_put(sk, psock); goto out; } write_lock_bh(&sk->sk_callback_lock); if (stream_parser && stream_verdict && !psock->saved_data_ready) { ret = sk_psock_init_strp(sk, psock); if (ret) { write_unlock_bh(&sk->sk_callback_lock); sk_psock_put(sk, psock); goto out; } sk_psock_start_strp(sk, psock); } else if (!stream_parser && stream_verdict && !psock->saved_data_ready) { sk_psock_start_verdict(sk,psock); } else if (!stream_verdict && skb_verdict && !psock->saved_data_ready) { sk_psock_start_verdict(sk, psock); } write_unlock_bh(&sk->sk_callback_lock); return 0; out_progs: if (skb_verdict) bpf_prog_put(skb_verdict); out_put_msg_parser: if (msg_parser) bpf_prog_put(msg_parser); out_put_stream_parser: if (stream_parser) bpf_prog_put(stream_parser); out_put_stream_verdict: if (stream_verdict) bpf_prog_put(stream_verdict); out: return ret; } static void sock_map_free(struct bpf_map *map) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); int i; /* After the sync no updates or deletes will be in-flight so it * is safe to walk map and remove entries without risking a race * in EEXIST update case. */ synchronize_rcu(); for (i = 0; i < stab->map.max_entries; i++) { struct sock **psk = &stab->sks[i]; struct sock *sk; sk = xchg(psk, NULL); if (sk) { sock_hold(sk); lock_sock(sk); rcu_read_lock(); sock_map_unref(sk, psk); rcu_read_unlock(); release_sock(sk); sock_put(sk); } } /* wait for psock readers accessing its map link */ synchronize_rcu(); bpf_map_area_free(stab->sks); bpf_map_area_free(stab); } static void sock_map_release_progs(struct bpf_map *map) { psock_progs_drop(&container_of(map, struct bpf_stab, map)->progs); } static struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); WARN_ON_ONCE(!rcu_read_lock_held()); if (unlikely(key >= map->max_entries)) return NULL; return READ_ONCE(stab->sks[key]); } static void *sock_map_lookup(struct bpf_map *map, void *key) { struct sock *sk; sk = __sock_map_lookup_elem(map, *(u32 *)key); if (!sk) return NULL; if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt)) return NULL; return sk; } static void *sock_map_lookup_sys(struct bpf_map *map, void *key) { struct sock *sk; if (map->value_size != sizeof(u64)) return ERR_PTR(-ENOSPC); sk = __sock_map_lookup_elem(map, *(u32 *)key); if (!sk) return ERR_PTR(-ENOENT); __sock_gen_cookie(sk); return &sk->sk_cookie; } static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test, struct sock **psk) { struct sock *sk; int err = 0; spin_lock_bh(&stab->lock); sk = *psk; if (!sk_test || sk_test == sk) sk = xchg(psk, NULL); if (likely(sk)) sock_map_unref(sk, psk); else err = -EINVAL; spin_unlock_bh(&stab->lock); return err; } static void sock_map_delete_from_link(struct bpf_map *map, struct sock *sk, void *link_raw) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); __sock_map_delete(stab, sk, link_raw); } static long sock_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); u32 i = *(u32 *)key; struct sock **psk; if (unlikely(i >= map->max_entries)) return -EINVAL; psk = &stab->sks[i]; return __sock_map_delete(stab, NULL, psk); } static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); u32 i = key ? *(u32 *)key : U32_MAX; u32 *key_next = next; if (i == stab->map.max_entries - 1) return -ENOENT; if (i >= stab->map.max_entries) *key_next = 0; else *key_next = i + 1; return 0; } static int sock_map_update_common(struct bpf_map *map, u32 idx, struct sock *sk, u64 flags) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); struct sk_psock_link *link; struct sk_psock *psock; struct sock *osk; int ret; WARN_ON_ONCE(!rcu_read_lock_held()); if (unlikely(flags > BPF_EXIST)) return -EINVAL; if (unlikely(idx >= map->max_entries)) return -E2BIG; link = sk_psock_init_link(); if (!link) return -ENOMEM; ret = sock_map_link(map, sk); if (ret < 0) goto out_free; psock = sk_psock(sk); WARN_ON_ONCE(!psock); spin_lock_bh(&stab->lock); osk = stab->sks[idx]; if (osk && flags == BPF_NOEXIST) { ret = -EEXIST; goto out_unlock; } else if (!osk && flags == BPF_EXIST) { ret = -ENOENT; goto out_unlock; } sock_map_add_link(psock, link, map, &stab->sks[idx]); stab->sks[idx] = sk; if (osk) sock_map_unref(osk, &stab->sks[idx]); spin_unlock_bh(&stab->lock); return 0; out_unlock: spin_unlock_bh(&stab->lock); if (psock) sk_psock_put(sk, psock); out_free: sk_psock_free_link(link); return ret; } static bool sock_map_op_okay(const struct bpf_sock_ops_kern *ops) { return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB || ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB || ops->op == BPF_SOCK_OPS_TCP_LISTEN_CB; } static bool sock_map_redirect_allowed(const struct sock *sk) { if (sk_is_tcp(sk)) return sk->sk_state != TCP_LISTEN; else return sk->sk_state == TCP_ESTABLISHED; } static bool sock_map_sk_is_suitable(const struct sock *sk) { return !!sk->sk_prot->psock_update_sk_prot; } static bool sock_map_sk_state_allowed(const struct sock *sk) { if (sk_is_tcp(sk)) return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_LISTEN); if (sk_is_stream_unix(sk)) return (1 << sk->sk_state) & TCPF_ESTABLISHED; return true; } static int sock_hash_update_common(struct bpf_map *map, void *key, struct sock *sk, u64 flags); int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags) { struct socket *sock; struct sock *sk; int ret; u64 ufd; if (map->value_size == sizeof(u64)) ufd = *(u64 *)value; else ufd = *(u32 *)value; if (ufd > S32_MAX) return -EINVAL; sock = sockfd_lookup(ufd, &ret); if (!sock) return ret; sk = sock->sk; if (!sk) { ret = -EINVAL; goto out; } if (!sock_map_sk_is_suitable(sk)) { ret = -EOPNOTSUPP; goto out; } sock_map_sk_acquire(sk); if (!sock_map_sk_state_allowed(sk)) ret = -EOPNOTSUPP; else if (map->map_type == BPF_MAP_TYPE_SOCKMAP) ret = sock_map_update_common(map, *(u32 *)key, sk, flags); else ret = sock_hash_update_common(map, key, sk, flags); sock_map_sk_release(sk); out: sockfd_put(sock); return ret; } static long sock_map_update_elem(struct bpf_map *map, void *key, void *value, u64 flags) { struct sock *sk = (struct sock *)value; int ret; if (unlikely(!sk || !sk_fullsock(sk))) return -EINVAL; if (!sock_map_sk_is_suitable(sk)) return -EOPNOTSUPP; local_bh_disable(); bh_lock_sock(sk); if (!sock_map_sk_state_allowed(sk)) ret = -EOPNOTSUPP; else if (map->map_type == BPF_MAP_TYPE_SOCKMAP) ret = sock_map_update_common(map, *(u32 *)key, sk, flags); else ret = sock_hash_update_common(map, key, sk, flags); bh_unlock_sock(sk); local_bh_enable(); return ret; } BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, sops, struct bpf_map *, map, void *, key, u64, flags) { WARN_ON_ONCE(!rcu_read_lock_held()); if (likely(sock_map_sk_is_suitable(sops->sk) && sock_map_op_okay(sops))) return sock_map_update_common(map, *(u32 *)key, sops->sk, flags); return -EOPNOTSUPP; } const struct bpf_func_proto bpf_sock_map_update_proto = { .func = bpf_sock_map_update, .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, struct bpf_map *, map, u32, key, u64, flags) { struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; sk = __sock_map_lookup_elem(map, key); if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); return SK_PASS; } const struct bpf_func_proto bpf_sk_redirect_map_proto = { .func = bpf_sk_redirect_map, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg *, msg, struct bpf_map *, map, u32, key, u64, flags) { struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; sk = __sock_map_lookup_elem(map, key); if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk)) return SK_DROP; msg->flags = flags; msg->sk_redir = sk; return SK_PASS; } const struct bpf_func_proto bpf_msg_redirect_map_proto = { .func = bpf_msg_redirect_map, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; struct sock_map_seq_info { struct bpf_map *map; struct sock *sk; u32 index; }; struct bpf_iter__sockmap { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct bpf_map *, map); __bpf_md_ptr(void *, key); __bpf_md_ptr(struct sock *, sk); }; DEFINE_BPF_ITER_FUNC(sockmap, struct bpf_iter_meta *meta, struct bpf_map *map, void *key, struct sock *sk) static void *sock_map_seq_lookup_elem(struct sock_map_seq_info *info) { if (unlikely(info->index >= info->map->max_entries)) return NULL; info->sk = __sock_map_lookup_elem(info->map, info->index); /* can't return sk directly, since that might be NULL */ return info; } static void *sock_map_seq_start(struct seq_file *seq, loff_t *pos) __acquires(rcu) { struct sock_map_seq_info *info = seq->private; if (*pos == 0) ++*pos; /* pairs with sock_map_seq_stop */ rcu_read_lock(); return sock_map_seq_lookup_elem(info); } static void *sock_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) __must_hold(rcu) { struct sock_map_seq_info *info = seq->private; ++*pos; ++info->index; return sock_map_seq_lookup_elem(info); } static int sock_map_seq_show(struct seq_file *seq, void *v) __must_hold(rcu) { struct sock_map_seq_info *info = seq->private; struct bpf_iter__sockmap ctx = {}; struct bpf_iter_meta meta; struct bpf_prog *prog; meta.seq = seq; prog = bpf_iter_get_info(&meta, !v); if (!prog) return 0; ctx.meta = &meta; ctx.map = info->map; if (v) { ctx.key = &info->index; ctx.sk = info->sk; } return bpf_iter_run_prog(prog, &ctx); } static void sock_map_seq_stop(struct seq_file *seq, void *v) __releases(rcu) { if (!v) (void)sock_map_seq_show(seq, NULL); /* pairs with sock_map_seq_start */ rcu_read_unlock(); } static const struct seq_operations sock_map_seq_ops = { .start = sock_map_seq_start, .next = sock_map_seq_next, .stop = sock_map_seq_stop, .show = sock_map_seq_show, }; static int sock_map_init_seq_private(void *priv_data, struct bpf_iter_aux_info *aux) { struct sock_map_seq_info *info = priv_data; bpf_map_inc_with_uref(aux->map); info->map = aux->map; return 0; } static void sock_map_fini_seq_private(void *priv_data) { struct sock_map_seq_info *info = priv_data; bpf_map_put_with_uref(info->map); } static u64 sock_map_mem_usage(const struct bpf_map *map) { u64 usage = sizeof(struct bpf_stab); usage += (u64)map->max_entries * sizeof(struct sock *); return usage; } static const struct bpf_iter_seq_info sock_map_iter_seq_info = { .seq_ops = &sock_map_seq_ops, .init_seq_private = sock_map_init_seq_private, .fini_seq_private = sock_map_fini_seq_private, .seq_priv_size = sizeof(struct sock_map_seq_info), }; BTF_ID_LIST_SINGLE(sock_map_btf_ids, struct, bpf_stab) const struct bpf_map_ops sock_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = sock_map_alloc, .map_free = sock_map_free, .map_get_next_key = sock_map_get_next_key, .map_lookup_elem_sys_only = sock_map_lookup_sys, .map_update_elem = sock_map_update_elem, .map_delete_elem = sock_map_delete_elem, .map_lookup_elem = sock_map_lookup, .map_release_uref = sock_map_release_progs, .map_check_btf = map_check_no_btf, .map_mem_usage = sock_map_mem_usage, .map_btf_id = &sock_map_btf_ids[0], .iter_seq_info = &sock_map_iter_seq_info, }; struct bpf_shtab_elem { struct rcu_head rcu; u32 hash; struct sock *sk; struct hlist_node node; u8 key[]; }; struct bpf_shtab_bucket { struct hlist_head head; spinlock_t lock; }; struct bpf_shtab { struct bpf_map map; struct bpf_shtab_bucket *buckets; u32 buckets_num; u32 elem_size; struct sk_psock_progs progs; atomic_t count; }; static inline u32 sock_hash_bucket_hash(const void *key, u32 len) { return jhash(key, len, 0); } static struct bpf_shtab_bucket *sock_hash_select_bucket(struct bpf_shtab *htab, u32 hash) { return &htab->buckets[hash & (htab->buckets_num - 1)]; } static struct bpf_shtab_elem * sock_hash_lookup_elem_raw(struct hlist_head *head, u32 hash, void *key, u32 key_size) { struct bpf_shtab_elem *elem; hlist_for_each_entry_rcu(elem, head, node) { if (elem->hash == hash && !memcmp(&elem->key, key, key_size)) return elem; } return NULL; } static struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); u32 key_size = map->key_size, hash; struct bpf_shtab_bucket *bucket; struct bpf_shtab_elem *elem; WARN_ON_ONCE(!rcu_read_lock_held()); hash = sock_hash_bucket_hash(key, key_size); bucket = sock_hash_select_bucket(htab, hash); elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); return elem ? elem->sk : NULL; } static void sock_hash_free_elem(struct bpf_shtab *htab, struct bpf_shtab_elem *elem) { atomic_dec(&htab->count); kfree_rcu(elem, rcu); } static void sock_hash_delete_from_link(struct bpf_map *map, struct sock *sk, void *link_raw) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); struct bpf_shtab_elem *elem_probe, *elem = link_raw; struct bpf_shtab_bucket *bucket; WARN_ON_ONCE(!rcu_read_lock_held()); bucket = sock_hash_select_bucket(htab, elem->hash); /* elem may be deleted in parallel from the map, but access here * is okay since it's going away only after RCU grace period. * However, we need to check whether it's still present. */ spin_lock_bh(&bucket->lock); elem_probe = sock_hash_lookup_elem_raw(&bucket->head, elem->hash, elem->key, map->key_size); if (elem_probe && elem_probe == elem) { hlist_del_rcu(&elem->node); sock_map_unref(elem->sk, elem); sock_hash_free_elem(htab, elem); } spin_unlock_bh(&bucket->lock); } static long sock_hash_delete_elem(struct bpf_map *map, void *key) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); u32 hash, key_size = map->key_size; struct bpf_shtab_bucket *bucket; struct bpf_shtab_elem *elem; int ret = -ENOENT; hash = sock_hash_bucket_hash(key, key_size); bucket = sock_hash_select_bucket(htab, hash); spin_lock_bh(&bucket->lock); elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); if (elem) { hlist_del_rcu(&elem->node); sock_map_unref(elem->sk, elem); sock_hash_free_elem(htab, elem); ret = 0; } spin_unlock_bh(&bucket->lock); return ret; } static struct bpf_shtab_elem *sock_hash_alloc_elem(struct bpf_shtab *htab, void *key, u32 key_size, u32 hash, struct sock *sk, struct bpf_shtab_elem *old) { struct bpf_shtab_elem *new; if (atomic_inc_return(&htab->count) > htab->map.max_entries) { if (!old) { atomic_dec(&htab->count); return ERR_PTR(-E2BIG); } } new = bpf_map_kmalloc_node(&htab->map, htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, htab->map.numa_node); if (!new) { atomic_dec(&htab->count); return ERR_PTR(-ENOMEM); } memcpy(new->key, key, key_size); new->sk = sk; new->hash = hash; return new; } static int sock_hash_update_common(struct bpf_map *map, void *key, struct sock *sk, u64 flags) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); u32 key_size = map->key_size, hash; struct bpf_shtab_elem *elem, *elem_new; struct bpf_shtab_bucket *bucket; struct sk_psock_link *link; struct sk_psock *psock; int ret; WARN_ON_ONCE(!rcu_read_lock_held()); if (unlikely(flags > BPF_EXIST)) return -EINVAL; link = sk_psock_init_link(); if (!link) return -ENOMEM; ret = sock_map_link(map, sk); if (ret < 0) goto out_free; psock = sk_psock(sk); WARN_ON_ONCE(!psock); hash = sock_hash_bucket_hash(key, key_size); bucket = sock_hash_select_bucket(htab, hash); spin_lock_bh(&bucket->lock); elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); if (elem && flags == BPF_NOEXIST) { ret = -EEXIST; goto out_unlock; } else if (!elem && flags == BPF_EXIST) { ret = -ENOENT; goto out_unlock; } elem_new = sock_hash_alloc_elem(htab, key, key_size, hash, sk, elem); if (IS_ERR(elem_new)) { ret = PTR_ERR(elem_new); goto out_unlock; } sock_map_add_link(psock, link, map, elem_new); /* Add new element to the head of the list, so that * concurrent search will find it before old elem. */ hlist_add_head_rcu(&elem_new->node, &bucket->head); if (elem) { hlist_del_rcu(&elem->node); sock_map_unref(elem->sk, elem); sock_hash_free_elem(htab, elem); } spin_unlock_bh(&bucket->lock); return 0; out_unlock: spin_unlock_bh(&bucket->lock); sk_psock_put(sk, psock); out_free: sk_psock_free_link(link); return ret; } static int sock_hash_get_next_key(struct bpf_map *map, void *key, void *key_next) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); struct bpf_shtab_elem *elem, *elem_next; u32 hash, key_size = map->key_size; struct hlist_head *head; int i = 0; if (!key) goto find_first_elem; hash = sock_hash_bucket_hash(key, key_size); head = &sock_hash_select_bucket(htab, hash)->head; elem = sock_hash_lookup_elem_raw(head, hash, key, key_size); if (!elem) goto find_first_elem; elem_next = hlist_entry_safe(rcu_dereference(hlist_next_rcu(&elem->node)), struct bpf_shtab_elem, node); if (elem_next) { memcpy(key_next, elem_next->key, key_size); return 0; } i = hash & (htab->buckets_num - 1); i++; find_first_elem: for (; i < htab->buckets_num; i++) { head = &sock_hash_select_bucket(htab, i)->head; elem_next = hlist_entry_safe(rcu_dereference(hlist_first_rcu(head)), struct bpf_shtab_elem, node); if (elem_next) { memcpy(key_next, elem_next->key, key_size); return 0; } } return -ENOENT; } static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) { struct bpf_shtab *htab; int i, err; if (attr->max_entries == 0 || attr->key_size == 0 || (attr->value_size != sizeof(u32) && attr->value_size != sizeof(u64)) || attr->map_flags & ~SOCK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); if (attr->key_size > MAX_BPF_STACK) return ERR_PTR(-E2BIG); htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE); if (!htab) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&htab->map, attr); htab->buckets_num = roundup_pow_of_two(htab->map.max_entries); htab->elem_size = sizeof(struct bpf_shtab_elem) + round_up(htab->map.key_size, 8); if (htab->buckets_num == 0 || htab->buckets_num > U32_MAX / sizeof(struct bpf_shtab_bucket)) { err = -EINVAL; goto free_htab; } htab->buckets = bpf_map_area_alloc(htab->buckets_num * sizeof(struct bpf_shtab_bucket), htab->map.numa_node); if (!htab->buckets) { err = -ENOMEM; goto free_htab; } for (i = 0; i < htab->buckets_num; i++) { INIT_HLIST_HEAD(&htab->buckets[i].head); spin_lock_init(&htab->buckets[i].lock); } return &htab->map; free_htab: bpf_map_area_free(htab); return ERR_PTR(err); } static void sock_hash_free(struct bpf_map *map) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); struct bpf_shtab_bucket *bucket; struct hlist_head unlink_list; struct bpf_shtab_elem *elem; struct hlist_node *node; int i; /* After the sync no updates or deletes will be in-flight so it * is safe to walk map and remove entries without risking a race * in EEXIST update case. */ synchronize_rcu(); for (i = 0; i < htab->buckets_num; i++) { bucket = sock_hash_select_bucket(htab, i); /* We are racing with sock_hash_delete_from_link to * enter the spin-lock critical section. Every socket on * the list is still linked to sockhash. Since link * exists, psock exists and holds a ref to socket. That * lets us to grab a socket ref too. */ spin_lock_bh(&bucket->lock); hlist_for_each_entry(elem, &bucket->head, node) sock_hold(elem->sk); hlist_move_list(&bucket->head, &unlink_list); spin_unlock_bh(&bucket->lock); /* Process removed entries out of atomic context to * block for socket lock before deleting the psock's * link to sockhash. */ hlist_for_each_entry_safe(elem, node, &unlink_list, node) { hlist_del(&elem->node); lock_sock(elem->sk); rcu_read_lock(); sock_map_unref(elem->sk, elem); rcu_read_unlock(); release_sock(elem->sk); sock_put(elem->sk); sock_hash_free_elem(htab, elem); } } /* wait for psock readers accessing its map link */ synchronize_rcu(); bpf_map_area_free(htab->buckets); bpf_map_area_free(htab); } static void *sock_hash_lookup_sys(struct bpf_map *map, void *key) { struct sock *sk; if (map->value_size != sizeof(u64)) return ERR_PTR(-ENOSPC); sk = __sock_hash_lookup_elem(map, key); if (!sk) return ERR_PTR(-ENOENT); __sock_gen_cookie(sk); return &sk->sk_cookie; } static void *sock_hash_lookup(struct bpf_map *map, void *key) { struct sock *sk; sk = __sock_hash_lookup_elem(map, key); if (!sk) return NULL; if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt)) return NULL; return sk; } static void sock_hash_release_progs(struct bpf_map *map) { psock_progs_drop(&container_of(map, struct bpf_shtab, map)->progs); } BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, sops, struct bpf_map *, map, void *, key, u64, flags) { WARN_ON_ONCE(!rcu_read_lock_held()); if (likely(sock_map_sk_is_suitable(sops->sk) && sock_map_op_okay(sops))) return sock_hash_update_common(map, key, sops->sk, flags); return -EOPNOTSUPP; } const struct bpf_func_proto bpf_sock_hash_update_proto = { .func = bpf_sock_hash_update, .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, struct bpf_map *, map, void *, key, u64, flags) { struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; sk = __sock_hash_lookup_elem(map, key); if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); return SK_PASS; } const struct bpf_func_proto bpf_sk_redirect_hash_proto = { .func = bpf_sk_redirect_hash, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg *, msg, struct bpf_map *, map, void *, key, u64, flags) { struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; sk = __sock_hash_lookup_elem(map, key); if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk)) return SK_DROP; msg->flags = flags; msg->sk_redir = sk; return SK_PASS; } const struct bpf_func_proto bpf_msg_redirect_hash_proto = { .func = bpf_msg_redirect_hash, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, }; struct sock_hash_seq_info { struct bpf_map *map; struct bpf_shtab *htab; u32 bucket_id; }; static void *sock_hash_seq_find_next(struct sock_hash_seq_info *info, struct bpf_shtab_elem *prev_elem) { const struct bpf_shtab *htab = info->htab; struct bpf_shtab_bucket *bucket; struct bpf_shtab_elem *elem; struct hlist_node *node; /* try to find next elem in the same bucket */ if (prev_elem) { node = rcu_dereference(hlist_next_rcu(&prev_elem->node)); elem = hlist_entry_safe(node, struct bpf_shtab_elem, node); if (elem) return elem; /* no more elements, continue in the next bucket */ info->bucket_id++; } for (; info->bucket_id < htab->buckets_num; info->bucket_id++) { bucket = &htab->buckets[info->bucket_id]; node = rcu_dereference(hlist_first_rcu(&bucket->head)); elem = hlist_entry_safe(node, struct bpf_shtab_elem, node); if (elem) return elem; } return NULL; } static void *sock_hash_seq_start(struct seq_file *seq, loff_t *pos) __acquires(rcu) { struct sock_hash_seq_info *info = seq->private; if (*pos == 0) ++*pos; /* pairs with sock_hash_seq_stop */ rcu_read_lock(); return sock_hash_seq_find_next(info, NULL); } static void *sock_hash_seq_next(struct seq_file *seq, void *v, loff_t *pos) __must_hold(rcu) { struct sock_hash_seq_info *info = seq->private; ++*pos; return sock_hash_seq_find_next(info, v); } static int sock_hash_seq_show(struct seq_file *seq, void *v) __must_hold(rcu) { struct sock_hash_seq_info *info = seq->private; struct bpf_iter__sockmap ctx = {}; struct bpf_shtab_elem *elem = v; struct bpf_iter_meta meta; struct bpf_prog *prog; meta.seq = seq; prog = bpf_iter_get_info(&meta, !elem); if (!prog) return 0; ctx.meta = &meta; ctx.map = info->map; if (elem) { ctx.key = elem->key; ctx.sk = elem->sk; } return bpf_iter_run_prog(prog, &ctx); } static void sock_hash_seq_stop(struct seq_file *seq, void *v) __releases(rcu) { if (!v) (void)sock_hash_seq_show(seq, NULL); /* pairs with sock_hash_seq_start */ rcu_read_unlock(); } static const struct seq_operations sock_hash_seq_ops = { .start = sock_hash_seq_start, .next = sock_hash_seq_next, .stop = sock_hash_seq_stop, .show = sock_hash_seq_show, }; static int sock_hash_init_seq_private(void *priv_data, struct bpf_iter_aux_info *aux) { struct sock_hash_seq_info *info = priv_data; bpf_map_inc_with_uref(aux->map); info->map = aux->map; info->htab = container_of(aux->map, struct bpf_shtab, map); return 0; } static void sock_hash_fini_seq_private(void *priv_data) { struct sock_hash_seq_info *info = priv_data; bpf_map_put_with_uref(info->map); } static u64 sock_hash_mem_usage(const struct bpf_map *map) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); u64 usage = sizeof(*htab); usage += htab->buckets_num * sizeof(struct bpf_shtab_bucket); usage += atomic_read(&htab->count) * (u64)htab->elem_size; return usage; } static const struct bpf_iter_seq_info sock_hash_iter_seq_info = { .seq_ops = &sock_hash_seq_ops, .init_seq_private = sock_hash_init_seq_private, .fini_seq_private = sock_hash_fini_seq_private, .seq_priv_size = sizeof(struct sock_hash_seq_info), }; BTF_ID_LIST_SINGLE(sock_hash_map_btf_ids, struct, bpf_shtab) const struct bpf_map_ops sock_hash_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = sock_hash_alloc, .map_free = sock_hash_free, .map_get_next_key = sock_hash_get_next_key, .map_update_elem = sock_map_update_elem, .map_delete_elem = sock_hash_delete_elem, .map_lookup_elem = sock_hash_lookup, .map_lookup_elem_sys_only = sock_hash_lookup_sys, .map_release_uref = sock_hash_release_progs, .map_check_btf = map_check_no_btf, .map_mem_usage = sock_hash_mem_usage, .map_btf_id = &sock_hash_map_btf_ids[0], .iter_seq_info = &sock_hash_iter_seq_info, }; static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) { switch (map->map_type) { case BPF_MAP_TYPE_SOCKMAP: return &container_of(map, struct bpf_stab, map)->progs; case BPF_MAP_TYPE_SOCKHASH: return &container_of(map, struct bpf_shtab, map)->progs; default: break; } return NULL; } static int sock_map_prog_link_lookup(struct bpf_map *map, struct bpf_prog ***pprog, struct bpf_link ***plink, u32 which) { struct sk_psock_progs *progs = sock_map_progs(map); struct bpf_prog **cur_pprog; struct bpf_link **cur_plink; if (!progs) return -EOPNOTSUPP; switch (which) { case BPF_SK_MSG_VERDICT: cur_pprog = &progs->msg_parser; cur_plink = &progs->msg_parser_link; break; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) case BPF_SK_SKB_STREAM_PARSER: cur_pprog = &progs->stream_parser; cur_plink = &progs->stream_parser_link; break; #endif case BPF_SK_SKB_STREAM_VERDICT: if (progs->skb_verdict) return -EBUSY; cur_pprog = &progs->stream_verdict; cur_plink = &progs->stream_verdict_link; break; case BPF_SK_SKB_VERDICT: if (progs->stream_verdict) return -EBUSY; cur_pprog = &progs->skb_verdict; cur_plink = &progs->skb_verdict_link; break; default: return -EOPNOTSUPP; } *pprog = cur_pprog; if (plink) *plink = cur_plink; return 0; } /* Handle the following four cases: * prog_attach: prog != NULL, old == NULL, link == NULL * prog_detach: prog == NULL, old != NULL, link == NULL * link_attach: prog != NULL, old == NULL, link != NULL * link_detach: prog == NULL, old != NULL, link != NULL */ static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, struct bpf_prog *old, struct bpf_link *link, u32 which) { struct bpf_prog **pprog; struct bpf_link **plink; int ret; ret = sock_map_prog_link_lookup(map, &pprog, &plink, which); if (ret) return ret; /* for prog_attach/prog_detach/link_attach, return error if a bpf_link * exists for that prog. */ if ((!link || prog) && *plink) return -EBUSY; if (old) { ret = psock_replace_prog(pprog, prog, old); if (!ret) *plink = NULL; } else { psock_set_prog(pprog, prog); if (link) *plink = link; } return ret; } int sock_map_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); u32 prog_cnt = 0, flags = 0, ufd = attr->target_fd; struct bpf_prog **pprog; struct bpf_prog *prog; struct bpf_map *map; struct fd f; u32 id = 0; int ret; if (attr->query.query_flags) return -EINVAL; f = fdget(ufd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); rcu_read_lock(); ret = sock_map_prog_link_lookup(map, &pprog, NULL, attr->query.attach_type); if (ret) goto end; prog = *pprog; prog_cnt = !prog ? 0 : 1; if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) goto end; /* we do not hold the refcnt, the bpf prog may be released * asynchronously and the id would be set to 0. */ id = data_race(prog->aux->id); if (id == 0) prog_cnt = 0; end: rcu_read_unlock(); if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)) || (id != 0 && copy_to_user(prog_ids, &id, sizeof(u32))) || copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) ret = -EFAULT; fdput(f); return ret; } static void sock_map_unlink(struct sock *sk, struct sk_psock_link *link) { switch (link->map->map_type) { case BPF_MAP_TYPE_SOCKMAP: return sock_map_delete_from_link(link->map, sk, link->link_raw); case BPF_MAP_TYPE_SOCKHASH: return sock_hash_delete_from_link(link->map, sk, link->link_raw); default: break; } } static void sock_map_remove_links(struct sock *sk, struct sk_psock *psock) { struct sk_psock_link *link; while ((link = sk_psock_link_pop(psock))) { sock_map_unlink(sk, link); sk_psock_free_link(link); } } void sock_map_unhash(struct sock *sk) { void (*saved_unhash)(struct sock *sk); struct sk_psock *psock; rcu_read_lock(); psock = sk_psock(sk); if (unlikely(!psock)) { rcu_read_unlock(); saved_unhash = READ_ONCE(sk->sk_prot)->unhash; } else { saved_unhash = psock->saved_unhash; sock_map_remove_links(sk, psock); rcu_read_unlock(); } if (WARN_ON_ONCE(saved_unhash == sock_map_unhash)) return; if (saved_unhash) saved_unhash(sk); } EXPORT_SYMBOL_GPL(sock_map_unhash); void sock_map_destroy(struct sock *sk) { void (*saved_destroy)(struct sock *sk); struct sk_psock *psock; rcu_read_lock(); psock = sk_psock_get(sk); if (unlikely(!psock)) { rcu_read_unlock(); saved_destroy = READ_ONCE(sk->sk_prot)->destroy; } else { saved_destroy = psock->saved_destroy; sock_map_remove_links(sk, psock); rcu_read_unlock(); sk_psock_stop(psock); sk_psock_put(sk, psock); } if (WARN_ON_ONCE(saved_destroy == sock_map_destroy)) return; if (saved_destroy) saved_destroy(sk); } EXPORT_SYMBOL_GPL(sock_map_destroy); void sock_map_close(struct sock *sk, long timeout) { void (*saved_close)(struct sock *sk, long timeout); struct sk_psock *psock; lock_sock(sk); rcu_read_lock(); psock = sk_psock(sk); if (likely(psock)) { saved_close = psock->saved_close; sock_map_remove_links(sk, psock); psock = sk_psock_get(sk); if (unlikely(!psock)) goto no_psock; rcu_read_unlock(); sk_psock_stop(psock); release_sock(sk); cancel_delayed_work_sync(&psock->work); sk_psock_put(sk, psock); } else { saved_close = READ_ONCE(sk->sk_prot)->close; no_psock: rcu_read_unlock(); release_sock(sk); } /* Make sure we do not recurse. This is a bug. * Leak the socket instead of crashing on a stack overflow. */ if (WARN_ON_ONCE(saved_close == sock_map_close)) return; saved_close(sk, timeout); } EXPORT_SYMBOL_GPL(sock_map_close); struct sockmap_link { struct bpf_link link; struct bpf_map *map; enum bpf_attach_type attach_type; }; static void sock_map_link_release(struct bpf_link *link) { struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); mutex_lock(&sockmap_mutex); if (!sockmap_link->map) goto out; WARN_ON_ONCE(sock_map_prog_update(sockmap_link->map, NULL, link->prog, link, sockmap_link->attach_type)); bpf_map_put_with_uref(sockmap_link->map); sockmap_link->map = NULL; out: mutex_unlock(&sockmap_mutex); } static int sock_map_link_detach(struct bpf_link *link) { sock_map_link_release(link); return 0; } static void sock_map_link_dealloc(struct bpf_link *link) { kfree(link); } /* Handle the following two cases: * case 1: link != NULL, prog != NULL, old != NULL * case 2: link != NULL, prog != NULL, old == NULL */ static int sock_map_link_update_prog(struct bpf_link *link, struct bpf_prog *prog, struct bpf_prog *old) { const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); struct bpf_prog **pprog, *old_link_prog; struct bpf_link **plink; int ret = 0; mutex_lock(&sockmap_mutex); /* If old prog is not NULL, ensure old prog is the same as link->prog. */ if (old && link->prog != old) { ret = -EPERM; goto out; } /* Ensure link->prog has the same type/attach_type as the new prog. */ if (link->prog->type != prog->type || link->prog->expected_attach_type != prog->expected_attach_type) { ret = -EINVAL; goto out; } ret = sock_map_prog_link_lookup(sockmap_link->map, &pprog, &plink, sockmap_link->attach_type); if (ret) goto out; /* return error if the stored bpf_link does not match the incoming bpf_link. */ if (link != *plink) { ret = -EBUSY; goto out; } if (old) { ret = psock_replace_prog(pprog, prog, old); if (ret) goto out; } else { psock_set_prog(pprog, prog); } bpf_prog_inc(prog); old_link_prog = xchg(&link->prog, prog); bpf_prog_put(old_link_prog); out: mutex_unlock(&sockmap_mutex); return ret; } static u32 sock_map_link_get_map_id(const struct sockmap_link *sockmap_link) { u32 map_id = 0; mutex_lock(&sockmap_mutex); if (sockmap_link->map) map_id = sockmap_link->map->id; mutex_unlock(&sockmap_mutex); return map_id; } static int sock_map_link_fill_info(const struct bpf_link *link, struct bpf_link_info *info) { const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); u32 map_id = sock_map_link_get_map_id(sockmap_link); info->sockmap.map_id = map_id; info->sockmap.attach_type = sockmap_link->attach_type; return 0; } static void sock_map_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); u32 map_id = sock_map_link_get_map_id(sockmap_link); seq_printf(seq, "map_id:\t%u\n", map_id); seq_printf(seq, "attach_type:\t%u\n", sockmap_link->attach_type); } static const struct bpf_link_ops sock_map_link_ops = { .release = sock_map_link_release, .dealloc = sock_map_link_dealloc, .detach = sock_map_link_detach, .update_prog = sock_map_link_update_prog, .fill_link_info = sock_map_link_fill_info, .show_fdinfo = sock_map_link_show_fdinfo, }; int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_link_primer link_primer; struct sockmap_link *sockmap_link; enum bpf_attach_type attach_type; struct bpf_map *map; int ret; if (attr->link_create.flags) return -EINVAL; map = bpf_map_get_with_uref(attr->link_create.target_fd); if (IS_ERR(map)) return PTR_ERR(map); if (map->map_type != BPF_MAP_TYPE_SOCKMAP && map->map_type != BPF_MAP_TYPE_SOCKHASH) { ret = -EINVAL; goto out; } sockmap_link = kzalloc(sizeof(*sockmap_link), GFP_USER); if (!sockmap_link) { ret = -ENOMEM; goto out; } attach_type = attr->link_create.attach_type; bpf_link_init(&sockmap_link->link, BPF_LINK_TYPE_SOCKMAP, &sock_map_link_ops, prog); sockmap_link->map = map; sockmap_link->attach_type = attach_type; ret = bpf_link_prime(&sockmap_link->link, &link_primer); if (ret) { kfree(sockmap_link); goto out; } mutex_lock(&sockmap_mutex); ret = sock_map_prog_update(map, prog, NULL, &sockmap_link->link, attach_type); mutex_unlock(&sockmap_mutex); if (ret) { bpf_link_cleanup(&link_primer); goto out; } /* Increase refcnt for the prog since when old prog is replaced with * psock_replace_prog() and psock_set_prog() its refcnt will be decreased. * * Actually, we do not need to increase refcnt for the prog since bpf_link * will hold a reference. But in order to have less complexity w.r.t. * replacing/setting prog, let us increase the refcnt to make things simpler. */ bpf_prog_inc(prog); return bpf_link_settle(&link_primer); out: bpf_map_put_with_uref(map); return ret; } static int sock_map_iter_attach_target(struct bpf_prog *prog, union bpf_iter_link_info *linfo, struct bpf_iter_aux_info *aux) { struct bpf_map *map; int err = -EINVAL; if (!linfo->map.map_fd) return -EBADF; map = bpf_map_get_with_uref(linfo->map.map_fd); if (IS_ERR(map)) return PTR_ERR(map); if (map->map_type != BPF_MAP_TYPE_SOCKMAP && map->map_type != BPF_MAP_TYPE_SOCKHASH) goto put_map; if (prog->aux->max_rdonly_access > map->key_size) { err = -EACCES; goto put_map; } aux->map = map; return 0; put_map: bpf_map_put_with_uref(map); return err; } static void sock_map_iter_detach_target(struct bpf_iter_aux_info *aux) { bpf_map_put_with_uref(aux->map); } static struct bpf_iter_reg sock_map_iter_reg = { .target = "sockmap", .attach_target = sock_map_iter_attach_target, .detach_target = sock_map_iter_detach_target, .show_fdinfo = bpf_iter_map_show_fdinfo, .fill_link_info = bpf_iter_map_fill_link_info, .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__sockmap, key), PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY }, { offsetof(struct bpf_iter__sockmap, sk), PTR_TO_BTF_ID_OR_NULL }, }, }; static int __init bpf_sockmap_iter_init(void) { sock_map_iter_reg.ctx_arg_info[1].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK]; return bpf_iter_reg_target(&sock_map_iter_reg); } late_initcall(bpf_sockmap_iter_init);
5 5 4 4 4 4 1 1 1 1 1 1 1 1 1 1 4 3 4 4 5 4 3 2 4 1 1 3 4 4 4 4 5 3 2 2 2 1 2 2 2 1 2 1 1 1 1 1 1 1 1 1 1 3 3 3 2 2 2 2 2 2 2 2 2 2 2 3 2 2 1 1 2 2 2 2 4 4 3 2 1 1 1 1 3 2 3 2 1 3 3 1 1 3 3 3 3 3 3 3 1 3 3 1 3 3 3 5 4 2 6 3 5 5 5 5 5 5 5 4 5 5 5 5 5 5 1 3 4 4 4 1 1 4 5 5 5 1 1 1 1 1 1 5 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 /* * af_llc.c - LLC User Interface SAPs * Description: * Functions in this module are implementation of socket based llc * communications for the Linux operating system. Support of llc class * one and class two is provided via SOCK_DGRAM and SOCK_STREAM * respectively. * * An llc2 connection is (mac + sap), only one llc2 sap connection * is allowed per mac. Though one sap may have multiple mac + sap * connections. * * Copyright (c) 2001 by Jay Schulist <jschlst@samba.org> * 2002-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * This program can be redistributed or modified under the terms of the * GNU General Public License as published by the Free Software Foundation. * This program is distributed without any warranty or implied warranty * of merchantability or fitness for a particular purpose. * * See the GNU General Public License for more details. */ #include <linux/compiler.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <net/llc.h> #include <net/llc_sap.h> #include <net/llc_pdu.h> #include <net/llc_conn.h> #include <net/tcp_states.h> /* remember: uninitialized global data is zeroed because its in .bss */ static u16 llc_ui_sap_last_autoport = LLC_SAP_DYN_START; static u16 llc_ui_sap_link_no_max[256]; static struct sockaddr_llc llc_ui_addrnull; static const struct proto_ops llc_ui_ops; static bool llc_ui_wait_for_conn(struct sock *sk, long timeout); static int llc_ui_wait_for_disc(struct sock *sk, long timeout); static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout); #if 0 #define dprintk(args...) printk(KERN_DEBUG args) #else #define dprintk(args...) do {} while (0) #endif /* Maybe we'll add some more in the future. */ #define LLC_CMSG_PKTINFO 1 /** * llc_ui_next_link_no - return the next unused link number for a sap * @sap: Address of sap to get link number from. * * Return the next unused link number for a given sap. */ static inline u16 llc_ui_next_link_no(int sap) { return llc_ui_sap_link_no_max[sap]++; } /** * llc_proto_type - return eth protocol for ARP header type * @arphrd: ARP header type. * * Given an ARP header type return the corresponding ethernet protocol. */ static inline __be16 llc_proto_type(u16 arphrd) { return htons(ETH_P_802_2); } /** * llc_ui_addr_null - determines if a address structure is null * @addr: Address to test if null. */ static inline u8 llc_ui_addr_null(struct sockaddr_llc *addr) { return !memcmp(addr, &llc_ui_addrnull, sizeof(*addr)); } /** * llc_ui_header_len - return length of llc header based on operation * @sk: Socket which contains a valid llc socket type. * @addr: Complete sockaddr_llc structure received from the user. * * Provide the length of the llc header depending on what kind of * operation the user would like to perform and the type of socket. * Returns the correct llc header length. */ static inline u8 llc_ui_header_len(struct sock *sk, struct sockaddr_llc *addr) { u8 rc = LLC_PDU_LEN_U; if (addr->sllc_test) rc = LLC_PDU_LEN_U; else if (addr->sllc_xid) /* We need to expand header to sizeof(struct llc_xid_info) * since llc_pdu_init_as_xid_cmd() sets 4,5,6 bytes of LLC header * as XID PDU. In llc_ui_sendmsg() we reserved header size and then * filled all other space with user data. If we won't reserve this * bytes, llc_pdu_init_as_xid_cmd() will overwrite user data */ rc = LLC_PDU_LEN_U_XID; else if (sk->sk_type == SOCK_STREAM) rc = LLC_PDU_LEN_I; return rc; } /** * llc_ui_send_data - send data via reliable llc2 connection * @sk: Connection the socket is using. * @skb: Data the user wishes to send. * @noblock: can we block waiting for data? * * Send data via reliable llc2 connection. * Returns 0 upon success, non-zero if action did not succeed. * * This function always consumes a reference to the skb. */ static int llc_ui_send_data(struct sock* sk, struct sk_buff *skb, int noblock) { struct llc_sock* llc = llc_sk(sk); if (unlikely(llc_data_accept_state(llc->state) || llc->remote_busy_flag || llc->p_flag)) { long timeout = sock_sndtimeo(sk, noblock); int rc; rc = llc_ui_wait_for_busy_core(sk, timeout); if (rc) { kfree_skb(skb); return rc; } } return llc_build_and_send_pkt(sk, skb); } static void llc_ui_sk_init(struct socket *sock, struct sock *sk) { sock_graft(sk, sock); sk->sk_type = sock->type; sock->ops = &llc_ui_ops; } static struct proto llc_proto = { .name = "LLC", .owner = THIS_MODULE, .obj_size = sizeof(struct llc_sock), .slab_flags = SLAB_TYPESAFE_BY_RCU, }; /** * llc_ui_create - alloc and init a new llc_ui socket * @net: network namespace (must be default network) * @sock: Socket to initialize and attach allocated sk to. * @protocol: Unused. * @kern: on behalf of kernel or userspace * * Allocate and initialize a new llc_ui socket, validate the user wants a * socket type we have available. * Returns 0 upon success, negative upon failure. */ static int llc_ui_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; int rc = -ESOCKTNOSUPPORT; if (!ns_capable(net->user_ns, CAP_NET_RAW)) return -EPERM; if (!net_eq(net, &init_net)) return -EAFNOSUPPORT; if (likely(sock->type == SOCK_DGRAM || sock->type == SOCK_STREAM)) { rc = -ENOMEM; sk = llc_sk_alloc(net, PF_LLC, GFP_KERNEL, &llc_proto, kern); if (sk) { rc = 0; llc_ui_sk_init(sock, sk); } } return rc; } /** * llc_ui_release - shutdown socket * @sock: Socket to release. * * Shutdown and deallocate an existing socket. */ static int llc_ui_release(struct socket *sock) { struct sock *sk = sock->sk; struct llc_sock *llc; if (unlikely(sk == NULL)) goto out; sock_hold(sk); lock_sock(sk); llc = llc_sk(sk); dprintk("%s: closing local(%02X) remote(%02X)\n", __func__, llc->laddr.lsap, llc->daddr.lsap); if (!llc_send_disc(sk)) llc_ui_wait_for_disc(sk, sk->sk_rcvtimeo); if (!sock_flag(sk, SOCK_ZAPPED)) { struct llc_sap *sap = llc->sap; /* Hold this for release_sock(), so that llc_backlog_rcv() * could still use it. */ llc_sap_hold(sap); llc_sap_remove_socket(llc->sap, sk); release_sock(sk); llc_sap_put(sap); } else { release_sock(sk); } netdev_put(llc->dev, &llc->dev_tracker); sock_put(sk); sock_orphan(sk); sock->sk = NULL; llc_sk_free(sk); out: return 0; } /** * llc_ui_autoport - provide dynamically allocate SAP number * * Provide the caller with a dynamically allocated SAP number according * to the rules that are set in this function. Returns: 0, upon failure, * SAP number otherwise. */ static int llc_ui_autoport(void) { struct llc_sap *sap; int i, tries = 0; while (tries < LLC_SAP_DYN_TRIES) { for (i = llc_ui_sap_last_autoport; i < LLC_SAP_DYN_STOP; i += 2) { sap = llc_sap_find(i); if (!sap) { llc_ui_sap_last_autoport = i + 2; goto out; } llc_sap_put(sap); } llc_ui_sap_last_autoport = LLC_SAP_DYN_START; tries++; } i = 0; out: return i; } /** * llc_ui_autobind - automatically bind a socket to a sap * @sock: socket to bind * @addr: address to connect to * * Used by llc_ui_connect and llc_ui_sendmsg when the user hasn't * specifically used llc_ui_bind to bind to an specific address/sap * * Returns: 0 upon success, negative otherwise. */ static int llc_ui_autobind(struct socket *sock, struct sockaddr_llc *addr) { struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); struct net_device *dev = NULL; struct llc_sap *sap; int rc = -EINVAL; if (!sock_flag(sk, SOCK_ZAPPED)) goto out; if (!addr->sllc_arphrd) addr->sllc_arphrd = ARPHRD_ETHER; if (addr->sllc_arphrd != ARPHRD_ETHER) goto out; rc = -ENODEV; if (sk->sk_bound_dev_if) { dev = dev_get_by_index(&init_net, sk->sk_bound_dev_if); if (dev && addr->sllc_arphrd != dev->type) { dev_put(dev); dev = NULL; } } else dev = dev_getfirstbyhwtype(&init_net, addr->sllc_arphrd); if (!dev) goto out; rc = -EUSERS; llc->laddr.lsap = llc_ui_autoport(); if (!llc->laddr.lsap) goto out; rc = -EBUSY; /* some other network layer is using the sap */ sap = llc_sap_open(llc->laddr.lsap, NULL); if (!sap) goto out; /* Note: We do not expect errors from this point. */ llc->dev = dev; netdev_tracker_alloc(llc->dev, &llc->dev_tracker, GFP_KERNEL); dev = NULL; memcpy(llc->laddr.mac, llc->dev->dev_addr, IFHWADDRLEN); memcpy(&llc->addr, addr, sizeof(llc->addr)); /* assign new connection to its SAP */ llc_sap_add_socket(sap, sk); sock_reset_flag(sk, SOCK_ZAPPED); rc = 0; out: dev_put(dev); return rc; } /** * llc_ui_bind - bind a socket to a specific address. * @sock: Socket to bind an address to. * @uaddr: Address the user wants the socket bound to. * @addrlen: Length of the uaddr structure. * * Bind a socket to a specific address. For llc a user is able to bind to * a specific sap only or mac + sap. * If the user desires to bind to a specific mac + sap, it is possible to * have multiple sap connections via multiple macs. * Bind and autobind for that matter must enforce the correct sap usage * otherwise all hell will break loose. * Returns: 0 upon success, negative otherwise. */ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen) { struct sockaddr_llc *addr = (struct sockaddr_llc *)uaddr; struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); struct net_device *dev = NULL; struct llc_sap *sap; int rc = -EINVAL; lock_sock(sk); if (unlikely(!sock_flag(sk, SOCK_ZAPPED) || addrlen != sizeof(*addr))) goto out; rc = -EAFNOSUPPORT; if (!addr->sllc_arphrd) addr->sllc_arphrd = ARPHRD_ETHER; if (unlikely(addr->sllc_family != AF_LLC || addr->sllc_arphrd != ARPHRD_ETHER)) goto out; dprintk("%s: binding %02X\n", __func__, addr->sllc_sap); rc = -ENODEV; rcu_read_lock(); if (sk->sk_bound_dev_if) { dev = dev_get_by_index_rcu(&init_net, sk->sk_bound_dev_if); if (dev) { if (is_zero_ether_addr(addr->sllc_mac)) memcpy(addr->sllc_mac, dev->dev_addr, IFHWADDRLEN); if (addr->sllc_arphrd != dev->type || !ether_addr_equal(addr->sllc_mac, dev->dev_addr)) { rc = -EINVAL; dev = NULL; } } } else { dev = dev_getbyhwaddr_rcu(&init_net, addr->sllc_arphrd, addr->sllc_mac); } dev_hold(dev); rcu_read_unlock(); if (!dev) goto out; if (!addr->sllc_sap) { rc = -EUSERS; addr->sllc_sap = llc_ui_autoport(); if (!addr->sllc_sap) goto out; } sap = llc_sap_find(addr->sllc_sap); if (!sap) { sap = llc_sap_open(addr->sllc_sap, NULL); rc = -EBUSY; /* some other network layer is using the sap */ if (!sap) goto out; } else { struct llc_addr laddr, daddr; struct sock *ask; memset(&laddr, 0, sizeof(laddr)); memset(&daddr, 0, sizeof(daddr)); /* * FIXME: check if the address is multicast, * only SOCK_DGRAM can do this. */ memcpy(laddr.mac, addr->sllc_mac, IFHWADDRLEN); laddr.lsap = addr->sllc_sap; rc = -EADDRINUSE; /* mac + sap clash. */ ask = llc_lookup_established(sap, &daddr, &laddr, &init_net); if (ask) { sock_put(ask); goto out_put; } } /* Note: We do not expect errors from this point. */ llc->dev = dev; netdev_tracker_alloc(llc->dev, &llc->dev_tracker, GFP_KERNEL); dev = NULL; llc->laddr.lsap = addr->sllc_sap; memcpy(llc->laddr.mac, addr->sllc_mac, IFHWADDRLEN); memcpy(&llc->addr, addr, sizeof(llc->addr)); /* assign new connection to its SAP */ llc_sap_add_socket(sap, sk); sock_reset_flag(sk, SOCK_ZAPPED); rc = 0; out_put: llc_sap_put(sap); out: dev_put(dev); release_sock(sk); return rc; } /** * llc_ui_shutdown - shutdown a connect llc2 socket. * @sock: Socket to shutdown. * @how: What part of the socket to shutdown. * * Shutdown a connected llc2 socket. Currently this function only supports * shutting down both sends and receives (2), we could probably make this * function such that a user can shutdown only half the connection but not * right now. * Returns: 0 upon success, negative otherwise. */ static int llc_ui_shutdown(struct socket *sock, int how) { struct sock *sk = sock->sk; int rc = -ENOTCONN; lock_sock(sk); if (unlikely(sk->sk_state != TCP_ESTABLISHED)) goto out; rc = -EINVAL; if (how != 2) goto out; rc = llc_send_disc(sk); if (!rc) rc = llc_ui_wait_for_disc(sk, sk->sk_rcvtimeo); /* Wake up anyone sleeping in poll */ sk->sk_state_change(sk); out: release_sock(sk); return rc; } /** * llc_ui_connect - Connect to a remote llc2 mac + sap. * @sock: Socket which will be connected to the remote destination. * @uaddr: Remote and possibly the local address of the new connection. * @addrlen: Size of uaddr structure. * @flags: Operational flags specified by the user. * * Connect to a remote llc2 mac + sap. The caller must specify the * destination mac and address to connect to. If the user hasn't previously * called bind(2) with a smac the address of the first interface of the * specified arp type will be used. * This function will autobind if user did not previously call bind. * Returns: 0 upon success, negative otherwise. */ static int llc_ui_connect(struct socket *sock, struct sockaddr *uaddr, int addrlen, int flags) { struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); struct sockaddr_llc *addr = (struct sockaddr_llc *)uaddr; int rc = -EINVAL; lock_sock(sk); if (unlikely(addrlen != sizeof(*addr))) goto out; rc = -EAFNOSUPPORT; if (unlikely(addr->sllc_family != AF_LLC)) goto out; if (unlikely(sk->sk_type != SOCK_STREAM)) goto out; rc = -EALREADY; if (unlikely(sock->state == SS_CONNECTING)) goto out; /* bind connection to sap if user hasn't done it. */ if (sock_flag(sk, SOCK_ZAPPED)) { /* bind to sap with null dev, exclusive */ rc = llc_ui_autobind(sock, addr); if (rc) goto out; } llc->daddr.lsap = addr->sllc_sap; memcpy(llc->daddr.mac, addr->sllc_mac, IFHWADDRLEN); sock->state = SS_CONNECTING; sk->sk_state = TCP_SYN_SENT; llc->link = llc_ui_next_link_no(llc->sap->laddr.lsap); rc = llc_establish_connection(sk, llc->dev->dev_addr, addr->sllc_mac, addr->sllc_sap); if (rc) { dprintk("%s: llc_ui_send_conn failed :-(\n", __func__); sock->state = SS_UNCONNECTED; sk->sk_state = TCP_CLOSE; goto out; } if (sk->sk_state == TCP_SYN_SENT) { const long timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); if (!timeo || !llc_ui_wait_for_conn(sk, timeo)) goto out; rc = sock_intr_errno(timeo); if (signal_pending(current)) goto out; } if (sk->sk_state == TCP_CLOSE) goto sock_error; sock->state = SS_CONNECTED; rc = 0; out: release_sock(sk); return rc; sock_error: rc = sock_error(sk) ? : -ECONNABORTED; sock->state = SS_UNCONNECTED; goto out; } /** * llc_ui_listen - allow a normal socket to accept incoming connections * @sock: Socket to allow incoming connections on. * @backlog: Number of connections to queue. * * Allow a normal socket to accept incoming connections. * Returns 0 upon success, negative otherwise. */ static int llc_ui_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; int rc = -EINVAL; lock_sock(sk); if (unlikely(sock->state != SS_UNCONNECTED)) goto out; rc = -EOPNOTSUPP; if (unlikely(sk->sk_type != SOCK_STREAM)) goto out; rc = -EAGAIN; if (sock_flag(sk, SOCK_ZAPPED)) goto out; rc = 0; if (!(unsigned int)backlog) /* BSDism */ backlog = 1; sk->sk_max_ack_backlog = backlog; if (sk->sk_state != TCP_LISTEN) { sk->sk_ack_backlog = 0; sk->sk_state = TCP_LISTEN; } sk->sk_socket->flags |= __SO_ACCEPTCON; out: release_sock(sk); return rc; } static int llc_ui_wait_for_disc(struct sock *sk, long timeout) { DEFINE_WAIT_FUNC(wait, woken_wake_function); int rc = 0; add_wait_queue(sk_sleep(sk), &wait); while (1) { if (sk_wait_event(sk, &timeout, READ_ONCE(sk->sk_state) == TCP_CLOSE, &wait)) break; rc = -ERESTARTSYS; if (signal_pending(current)) break; rc = -EAGAIN; if (!timeout) break; rc = 0; } remove_wait_queue(sk_sleep(sk), &wait); return rc; } static bool llc_ui_wait_for_conn(struct sock *sk, long timeout) { DEFINE_WAIT_FUNC(wait, woken_wake_function); add_wait_queue(sk_sleep(sk), &wait); while (1) { if (sk_wait_event(sk, &timeout, READ_ONCE(sk->sk_state) != TCP_SYN_SENT, &wait)) break; if (signal_pending(current) || !timeout) break; } remove_wait_queue(sk_sleep(sk), &wait); return timeout; } static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct llc_sock *llc = llc_sk(sk); int rc; add_wait_queue(sk_sleep(sk), &wait); while (1) { rc = 0; if (sk_wait_event(sk, &timeout, (READ_ONCE(sk->sk_shutdown) & RCV_SHUTDOWN) || (!llc_data_accept_state(llc->state) && !llc->remote_busy_flag && !llc->p_flag), &wait)) break; rc = -ERESTARTSYS; if (signal_pending(current)) break; rc = -EAGAIN; if (!timeout) break; } remove_wait_queue(sk_sleep(sk), &wait); return rc; } static int llc_wait_data(struct sock *sk, long timeo) { int rc; while (1) { /* * POSIX 1003.1g mandates this order. */ rc = sock_error(sk); if (rc) break; rc = 0; if (sk->sk_shutdown & RCV_SHUTDOWN) break; rc = -EAGAIN; if (!timeo) break; rc = sock_intr_errno(timeo); if (signal_pending(current)) break; rc = 0; if (sk_wait_data(sk, &timeo, NULL)) break; } return rc; } static void llc_cmsg_rcv(struct msghdr *msg, struct sk_buff *skb) { struct llc_sock *llc = llc_sk(skb->sk); if (llc->cmsg_flags & LLC_CMSG_PKTINFO) { struct llc_pktinfo info; memset(&info, 0, sizeof(info)); info.lpi_ifindex = llc_sk(skb->sk)->dev->ifindex; llc_pdu_decode_dsap(skb, &info.lpi_sap); llc_pdu_decode_da(skb, info.lpi_mac); put_cmsg(msg, SOL_LLC, LLC_OPT_PKTINFO, sizeof(info), &info); } } /** * llc_ui_accept - accept a new incoming connection. * @sock: Socket which connections arrive on. * @newsock: Socket to move incoming connection to. * @arg: User specified arguments * * Accept a new incoming connection. * Returns 0 upon success, negative otherwise. */ static int llc_ui_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { struct sock *sk = sock->sk, *newsk; struct llc_sock *llc, *newllc; struct sk_buff *skb; int rc = -EOPNOTSUPP; dprintk("%s: accepting on %02X\n", __func__, llc_sk(sk)->laddr.lsap); lock_sock(sk); if (unlikely(sk->sk_type != SOCK_STREAM)) goto out; rc = -EINVAL; if (unlikely(sock->state != SS_UNCONNECTED || sk->sk_state != TCP_LISTEN)) goto out; /* wait for a connection to arrive. */ if (skb_queue_empty(&sk->sk_receive_queue)) { rc = llc_wait_data(sk, sk->sk_rcvtimeo); if (rc) goto out; } dprintk("%s: got a new connection on %02X\n", __func__, llc_sk(sk)->laddr.lsap); skb = skb_dequeue(&sk->sk_receive_queue); rc = -EINVAL; if (!skb->sk) goto frees; rc = 0; newsk = skb->sk; /* attach connection to a new socket. */ llc_ui_sk_init(newsock, newsk); sock_reset_flag(newsk, SOCK_ZAPPED); newsk->sk_state = TCP_ESTABLISHED; newsock->state = SS_CONNECTED; llc = llc_sk(sk); newllc = llc_sk(newsk); memcpy(&newllc->addr, &llc->addr, sizeof(newllc->addr)); newllc->link = llc_ui_next_link_no(newllc->laddr.lsap); /* put original socket back into a clean listen state. */ sk->sk_state = TCP_LISTEN; sk_acceptq_removed(sk); dprintk("%s: ok success on %02X, client on %02X\n", __func__, llc_sk(sk)->addr.sllc_sap, newllc->daddr.lsap); frees: kfree_skb(skb); out: release_sock(sk); return rc; } /** * llc_ui_recvmsg - copy received data to the socket user. * @sock: Socket to copy data from. * @msg: Various user space related information. * @len: Size of user buffer. * @flags: User specified flags. * * Copy received data to the socket user. * Returns non-negative upon success, negative otherwise. */ static int llc_ui_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { DECLARE_SOCKADDR(struct sockaddr_llc *, uaddr, msg->msg_name); const int nonblock = flags & MSG_DONTWAIT; struct sk_buff *skb = NULL; struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); size_t copied = 0; u32 peek_seq = 0; u32 *seq, skb_len; unsigned long used; int target; /* Read at least this many bytes */ long timeo; lock_sock(sk); copied = -ENOTCONN; if (unlikely(sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_LISTEN)) goto out; timeo = sock_rcvtimeo(sk, nonblock); seq = &llc->copied_seq; if (flags & MSG_PEEK) { peek_seq = llc->copied_seq; seq = &peek_seq; } target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); copied = 0; do { u32 offset; /* * We need to check signals first, to get correct SIGURG * handling. FIXME: Need to check this doesn't impact 1003.1g * and move it down to the bottom of the loop */ if (signal_pending(current)) { if (copied) break; copied = timeo ? sock_intr_errno(timeo) : -EAGAIN; break; } /* Next get a buffer. */ skb = skb_peek(&sk->sk_receive_queue); if (skb) { offset = *seq; goto found_ok_skb; } /* Well, if we have backlog, try to process it now yet. */ if (copied >= target && !READ_ONCE(sk->sk_backlog.tail)) break; if (copied) { if (sk->sk_err || sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN) || !timeo || (flags & MSG_PEEK)) break; } else { if (sock_flag(sk, SOCK_DONE)) break; if (sk->sk_err) { copied = sock_error(sk); break; } if (sk->sk_shutdown & RCV_SHUTDOWN) break; if (sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_CLOSE) { if (!sock_flag(sk, SOCK_DONE)) { /* * This occurs when user tries to read * from never connected socket. */ copied = -ENOTCONN; break; } break; } if (!timeo) { copied = -EAGAIN; break; } } if (copied >= target) { /* Do not sleep, just process backlog. */ release_sock(sk); lock_sock(sk); } else sk_wait_data(sk, &timeo, NULL); if ((flags & MSG_PEEK) && peek_seq != llc->copied_seq) { net_dbg_ratelimited("LLC(%s:%d): Application bug, race in MSG_PEEK\n", current->comm, task_pid_nr(current)); peek_seq = llc->copied_seq; } continue; found_ok_skb: skb_len = skb->len; /* Ok so how much can we use? */ used = skb->len - offset; if (len < used) used = len; if (!(flags & MSG_TRUNC)) { int rc = skb_copy_datagram_msg(skb, offset, msg, used); if (rc) { /* Exception. Bailout! */ if (!copied) copied = -EFAULT; break; } } *seq += used; copied += used; len -= used; /* For non stream protcols we get one packet per recvmsg call */ if (sk->sk_type != SOCK_STREAM) goto copy_uaddr; if (!(flags & MSG_PEEK)) { skb_unlink(skb, &sk->sk_receive_queue); kfree_skb(skb); *seq = 0; } /* Partial read */ if (used + offset < skb_len) continue; } while (len > 0); out: release_sock(sk); return copied; copy_uaddr: if (uaddr != NULL && skb != NULL) { memcpy(uaddr, llc_ui_skb_cb(skb), sizeof(*uaddr)); msg->msg_namelen = sizeof(*uaddr); } if (llc_sk(sk)->cmsg_flags) llc_cmsg_rcv(msg, skb); if (!(flags & MSG_PEEK)) { skb_unlink(skb, &sk->sk_receive_queue); kfree_skb(skb); *seq = 0; } goto out; } /** * llc_ui_sendmsg - Transmit data provided by the socket user. * @sock: Socket to transmit data from. * @msg: Various user related information. * @len: Length of data to transmit. * * Transmit data provided by the socket user. * Returns non-negative upon success, negative otherwise. */ static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { DECLARE_SOCKADDR(struct sockaddr_llc *, addr, msg->msg_name); struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); int flags = msg->msg_flags; int noblock = flags & MSG_DONTWAIT; int rc = -EINVAL, copied = 0, hdrlen, hh_len; struct sk_buff *skb = NULL; struct net_device *dev; size_t size = 0; dprintk("%s: sending from %02X to %02X\n", __func__, llc->laddr.lsap, llc->daddr.lsap); lock_sock(sk); if (addr) { if (msg->msg_namelen < sizeof(*addr)) goto out; } else { if (llc_ui_addr_null(&llc->addr)) goto out; addr = &llc->addr; } /* must bind connection to sap if user hasn't done it. */ if (sock_flag(sk, SOCK_ZAPPED)) { /* bind to sap with null dev, exclusive. */ rc = llc_ui_autobind(sock, addr); if (rc) goto out; } dev = llc->dev; hh_len = LL_RESERVED_SPACE(dev); hdrlen = llc_ui_header_len(sk, addr); size = hdrlen + len; size = min_t(size_t, size, READ_ONCE(dev->mtu)); copied = size - hdrlen; rc = -EINVAL; if (copied < 0) goto out; release_sock(sk); skb = sock_alloc_send_skb(sk, hh_len + size, noblock, &rc); lock_sock(sk); if (!skb) goto out; if (sock_flag(sk, SOCK_ZAPPED) || llc->dev != dev || hdrlen != llc_ui_header_len(sk, addr) || hh_len != LL_RESERVED_SPACE(dev) || size > READ_ONCE(dev->mtu)) goto out; skb->dev = dev; skb->protocol = llc_proto_type(addr->sllc_arphrd); skb_reserve(skb, hh_len + hdrlen); rc = memcpy_from_msg(skb_put(skb, copied), msg, copied); if (rc) goto out; if (sk->sk_type == SOCK_DGRAM || addr->sllc_ua) { llc_build_and_send_ui_pkt(llc->sap, skb, addr->sllc_mac, addr->sllc_sap); skb = NULL; goto out; } if (addr->sllc_test) { llc_build_and_send_test_pkt(llc->sap, skb, addr->sllc_mac, addr->sllc_sap); skb = NULL; goto out; } if (addr->sllc_xid) { llc_build_and_send_xid_pkt(llc->sap, skb, addr->sllc_mac, addr->sllc_sap); skb = NULL; goto out; } rc = -ENOPROTOOPT; if (!(sk->sk_type == SOCK_STREAM && !addr->sllc_ua)) goto out; rc = llc_ui_send_data(sk, skb, noblock); skb = NULL; out: kfree_skb(skb); if (rc) dprintk("%s: failed sending from %02X to %02X: %d\n", __func__, llc->laddr.lsap, llc->daddr.lsap, rc); release_sock(sk); return rc ? : copied; } /** * llc_ui_getname - return the address info of a socket * @sock: Socket to get address of. * @uaddr: Address structure to return information. * @peer: Does user want local or remote address information. * * Return the address information of a socket. */ static int llc_ui_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sockaddr_llc sllc; struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); int rc = -EBADF; memset(&sllc, 0, sizeof(sllc)); lock_sock(sk); if (sock_flag(sk, SOCK_ZAPPED)) goto out; if (peer) { rc = -ENOTCONN; if (sk->sk_state != TCP_ESTABLISHED) goto out; if(llc->dev) sllc.sllc_arphrd = llc->dev->type; sllc.sllc_sap = llc->daddr.lsap; memcpy(&sllc.sllc_mac, &llc->daddr.mac, IFHWADDRLEN); } else { rc = -EINVAL; if (!llc->sap) goto out; sllc.sllc_sap = llc->sap->laddr.lsap; if (llc->dev) { sllc.sllc_arphrd = llc->dev->type; memcpy(&sllc.sllc_mac, llc->dev->dev_addr, IFHWADDRLEN); } } sllc.sllc_family = AF_LLC; memcpy(uaddr, &sllc, sizeof(sllc)); rc = sizeof(sllc); out: release_sock(sk); return rc; } /** * llc_ui_ioctl - io controls for PF_LLC * @sock: Socket to get/set info * @cmd: command * @arg: optional argument for cmd * * get/set info on llc sockets */ static int llc_ui_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { return -ENOIOCTLCMD; } /** * llc_ui_setsockopt - set various connection specific parameters. * @sock: Socket to set options on. * @level: Socket level user is requesting operations on. * @optname: Operation name. * @optval: User provided operation data. * @optlen: Length of optval. * * Set various connection specific parameters. */ static int llc_ui_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); unsigned int opt; int rc = -EINVAL; lock_sock(sk); if (unlikely(level != SOL_LLC || optlen != sizeof(int))) goto out; rc = copy_from_sockptr(&opt, optval, sizeof(opt)); if (rc) goto out; rc = -EINVAL; switch (optname) { case LLC_OPT_RETRY: if (opt > LLC_OPT_MAX_RETRY) goto out; llc->n2 = opt; break; case LLC_OPT_SIZE: if (opt > LLC_OPT_MAX_SIZE) goto out; llc->n1 = opt; break; case LLC_OPT_ACK_TMR_EXP: if (opt > LLC_OPT_MAX_ACK_TMR_EXP) goto out; llc->ack_timer.expire = opt * HZ; break; case LLC_OPT_P_TMR_EXP: if (opt > LLC_OPT_MAX_P_TMR_EXP) goto out; llc->pf_cycle_timer.expire = opt * HZ; break; case LLC_OPT_REJ_TMR_EXP: if (opt > LLC_OPT_MAX_REJ_TMR_EXP) goto out; llc->rej_sent_timer.expire = opt * HZ; break; case LLC_OPT_BUSY_TMR_EXP: if (opt > LLC_OPT_MAX_BUSY_TMR_EXP) goto out; llc->busy_state_timer.expire = opt * HZ; break; case LLC_OPT_TX_WIN: if (opt > LLC_OPT_MAX_WIN) goto out; llc->k = opt; break; case LLC_OPT_RX_WIN: if (opt > LLC_OPT_MAX_WIN) goto out; llc->rw = opt; break; case LLC_OPT_PKTINFO: if (opt) llc->cmsg_flags |= LLC_CMSG_PKTINFO; else llc->cmsg_flags &= ~LLC_CMSG_PKTINFO; break; default: rc = -ENOPROTOOPT; goto out; } rc = 0; out: release_sock(sk); return rc; } /** * llc_ui_getsockopt - get connection specific socket info * @sock: Socket to get information from. * @level: Socket level user is requesting operations on. * @optname: Operation name. * @optval: Variable to return operation data in. * @optlen: Length of optval. * * Get connection specific socket information. */ static int llc_ui_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); int val = 0, len = 0, rc = -EINVAL; lock_sock(sk); if (unlikely(level != SOL_LLC)) goto out; rc = get_user(len, optlen); if (rc) goto out; rc = -EINVAL; if (len != sizeof(int)) goto out; switch (optname) { case LLC_OPT_RETRY: val = llc->n2; break; case LLC_OPT_SIZE: val = llc->n1; break; case LLC_OPT_ACK_TMR_EXP: val = llc->ack_timer.expire / HZ; break; case LLC_OPT_P_TMR_EXP: val = llc->pf_cycle_timer.expire / HZ; break; case LLC_OPT_REJ_TMR_EXP: val = llc->rej_sent_timer.expire / HZ; break; case LLC_OPT_BUSY_TMR_EXP: val = llc->busy_state_timer.expire / HZ; break; case LLC_OPT_TX_WIN: val = llc->k; break; case LLC_OPT_RX_WIN: val = llc->rw; break; case LLC_OPT_PKTINFO: val = (llc->cmsg_flags & LLC_CMSG_PKTINFO) != 0; break; default: rc = -ENOPROTOOPT; goto out; } rc = 0; if (put_user(len, optlen) || copy_to_user(optval, &val, len)) rc = -EFAULT; out: release_sock(sk); return rc; } static const struct net_proto_family llc_ui_family_ops = { .family = PF_LLC, .create = llc_ui_create, .owner = THIS_MODULE, }; static const struct proto_ops llc_ui_ops = { .family = PF_LLC, .owner = THIS_MODULE, .release = llc_ui_release, .bind = llc_ui_bind, .connect = llc_ui_connect, .socketpair = sock_no_socketpair, .accept = llc_ui_accept, .getname = llc_ui_getname, .poll = datagram_poll, .ioctl = llc_ui_ioctl, .listen = llc_ui_listen, .shutdown = llc_ui_shutdown, .setsockopt = llc_ui_setsockopt, .getsockopt = llc_ui_getsockopt, .sendmsg = llc_ui_sendmsg, .recvmsg = llc_ui_recvmsg, .mmap = sock_no_mmap, }; static const char llc_proc_err_msg[] __initconst = KERN_CRIT "LLC: Unable to register the proc_fs entries\n"; static const char llc_sysctl_err_msg[] __initconst = KERN_CRIT "LLC: Unable to register the sysctl entries\n"; static const char llc_sock_err_msg[] __initconst = KERN_CRIT "LLC: Unable to register the network family\n"; static int __init llc2_init(void) { int rc = proto_register(&llc_proto, 0); if (rc != 0) goto out; llc_build_offset_table(); llc_station_init(); llc_ui_sap_last_autoport = LLC_SAP_DYN_START; rc = llc_proc_init(); if (rc != 0) { printk(llc_proc_err_msg); goto out_station; } rc = llc_sysctl_init(); if (rc) { printk(llc_sysctl_err_msg); goto out_proc; } rc = sock_register(&llc_ui_family_ops); if (rc) { printk(llc_sock_err_msg); goto out_sysctl; } llc_add_pack(LLC_DEST_SAP, llc_sap_handler); llc_add_pack(LLC_DEST_CONN, llc_conn_handler); out: return rc; out_sysctl: llc_sysctl_exit(); out_proc: llc_proc_exit(); out_station: llc_station_exit(); proto_unregister(&llc_proto); goto out; } static void __exit llc2_exit(void) { llc_station_exit(); llc_remove_pack(LLC_DEST_SAP); llc_remove_pack(LLC_DEST_CONN); sock_unregister(PF_LLC); llc_proc_exit(); llc_sysctl_exit(); proto_unregister(&llc_proto); } module_init(llc2_init); module_exit(llc2_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Procom 1997, Jay Schullist 2001, Arnaldo C. Melo 2001-2003"); MODULE_DESCRIPTION("IEEE 802.2 PF_LLC support"); MODULE_ALIAS_NETPROTO(PF_LLC);
5 5 5 5 5 4 5 1 1 1 4 4 4 4 4 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2008, Intel Corporation. * * Author: Alexander Duyck <alexander.h.duyck@intel.com> */ #ifndef __NET_TC_SKBEDIT_H #define __NET_TC_SKBEDIT_H #include <net/act_api.h> #include <linux/tc_act/tc_skbedit.h> struct tcf_skbedit_params { u32 flags; u32 priority; u32 mark; u32 mask; u16 queue_mapping; u16 mapping_mod; u16 ptype; struct rcu_head rcu; }; struct tcf_skbedit { struct tc_action common; struct tcf_skbedit_params __rcu *params; }; #define to_skbedit(a) ((struct tcf_skbedit *)a) /* Return true iff action is the one identified by FLAG. */ static inline bool is_tcf_skbedit_with_flag(const struct tc_action *a, u32 flag) { #ifdef CONFIG_NET_CLS_ACT u32 flags; if (a->ops && a->ops->id == TCA_ID_SKBEDIT) { rcu_read_lock(); flags = rcu_dereference(to_skbedit(a)->params)->flags; rcu_read_unlock(); return flags == flag; } #endif return false; } /* Return true iff action is mark */ static inline bool is_tcf_skbedit_mark(const struct tc_action *a) { return is_tcf_skbedit_with_flag(a, SKBEDIT_F_MARK); } static inline u32 tcf_skbedit_mark(const struct tc_action *a) { u32 mark; rcu_read_lock(); mark = rcu_dereference(to_skbedit(a)->params)->mark; rcu_read_unlock(); return mark; } /* Return true iff action is ptype */ static inline bool is_tcf_skbedit_ptype(const struct tc_action *a) { return is_tcf_skbedit_with_flag(a, SKBEDIT_F_PTYPE); } static inline u32 tcf_skbedit_ptype(const struct tc_action *a) { u16 ptype; rcu_read_lock(); ptype = rcu_dereference(to_skbedit(a)->params)->ptype; rcu_read_unlock(); return ptype; } /* Return true iff action is priority */ static inline bool is_tcf_skbedit_priority(const struct tc_action *a) { return is_tcf_skbedit_with_flag(a, SKBEDIT_F_PRIORITY); } static inline u32 tcf_skbedit_priority(const struct tc_action *a) { u32 priority; rcu_read_lock(); priority = rcu_dereference(to_skbedit(a)->params)->priority; rcu_read_unlock(); return priority; } static inline u16 tcf_skbedit_rx_queue_mapping(const struct tc_action *a) { u16 rx_queue; rcu_read_lock(); rx_queue = rcu_dereference(to_skbedit(a)->params)->queue_mapping; rcu_read_unlock(); return rx_queue; } /* Return true iff action is queue_mapping */ static inline bool is_tcf_skbedit_queue_mapping(const struct tc_action *a) { return is_tcf_skbedit_with_flag(a, SKBEDIT_F_QUEUE_MAPPING); } /* Return true if action is on ingress traffic */ static inline bool is_tcf_skbedit_ingress(u32 flags) { return flags & TCA_ACT_FLAGS_AT_INGRESS; } static inline bool is_tcf_skbedit_tx_queue_mapping(const struct tc_action *a) { return is_tcf_skbedit_queue_mapping(a) && !is_tcf_skbedit_ingress(a->tcfa_flags); } static inline bool is_tcf_skbedit_rx_queue_mapping(const struct tc_action *a) { return is_tcf_skbedit_queue_mapping(a) && is_tcf_skbedit_ingress(a->tcfa_flags); } /* Return true iff action is inheritdsfield */ static inline bool is_tcf_skbedit_inheritdsfield(const struct tc_action *a) { return is_tcf_skbedit_with_flag(a, SKBEDIT_F_INHERITDSFIELD); } #endif /* __NET_TC_SKBEDIT_H */
20 22 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_NSPROXY_H #define _LINUX_NSPROXY_H #include <linux/refcount.h> #include <linux/spinlock.h> #include <linux/sched.h> struct mnt_namespace; struct uts_namespace; struct ipc_namespace; struct pid_namespace; struct cgroup_namespace; struct fs_struct; /* * A structure to contain pointers to all per-process * namespaces - fs (mount), uts, network, sysvipc, etc. * * The pid namespace is an exception -- it's accessed using * task_active_pid_ns. The pid namespace here is the * namespace that children will use. * * 'count' is the number of tasks holding a reference. * The count for each namespace, then, will be the number * of nsproxies pointing to it, not the number of tasks. * * The nsproxy is shared by tasks which share all namespaces. * As soon as a single namespace is cloned or unshared, the * nsproxy is copied. */ struct nsproxy { refcount_t count; struct uts_namespace *uts_ns; struct ipc_namespace *ipc_ns; struct mnt_namespace *mnt_ns; struct pid_namespace *pid_ns_for_children; struct net *net_ns; struct time_namespace *time_ns; struct time_namespace *time_ns_for_children; struct cgroup_namespace *cgroup_ns; }; extern struct nsproxy init_nsproxy; /* * A structure to encompass all bits needed to install * a partial or complete new set of namespaces. * * If a new user namespace is requested cred will * point to a modifiable set of credentials. If a pointer * to a modifiable set is needed nsset_cred() must be * used and tested. */ struct nsset { unsigned flags; struct nsproxy *nsproxy; struct fs_struct *fs; const struct cred *cred; }; static inline struct cred *nsset_cred(struct nsset *set) { if (set->flags & CLONE_NEWUSER) return (struct cred *)set->cred; return NULL; } /* * the namespaces access rules are: * * 1. only current task is allowed to change tsk->nsproxy pointer or * any pointer on the nsproxy itself. Current must hold the task_lock * when changing tsk->nsproxy. * * 2. when accessing (i.e. reading) current task's namespaces - no * precautions should be taken - just dereference the pointers * * 3. the access to other task namespaces is performed like this * task_lock(task); * nsproxy = task->nsproxy; * if (nsproxy != NULL) { * / * * * work with the namespaces here * * e.g. get the reference on one of them * * / * } / * * * NULL task->nsproxy means that this task is * * almost dead (zombie) * * / * task_unlock(task); * */ int copy_namespaces(unsigned long flags, struct task_struct *tsk); void exit_task_namespaces(struct task_struct *tsk); void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); int exec_task_namespaces(void); void free_nsproxy(struct nsproxy *ns); int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **, struct cred *, struct fs_struct *); int __init nsproxy_cache_init(void); static inline void put_nsproxy(struct nsproxy *ns) { if (refcount_dec_and_test(&ns->count)) free_nsproxy(ns); } static inline void get_nsproxy(struct nsproxy *ns) { refcount_inc(&ns->count); } #endif
37 40 48 216 142 1 220 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 /* SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause) */ /* Copyright (C) 2016-2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. * * SipHash: a fast short-input PRF * https://131002.net/siphash/ * * This implementation is specifically for SipHash2-4 for a secure PRF * and HalfSipHash1-3/SipHash1-3 for an insecure PRF only suitable for * hashtables. */ #ifndef _LINUX_SIPHASH_H #define _LINUX_SIPHASH_H #include <linux/types.h> #include <linux/kernel.h> #define SIPHASH_ALIGNMENT __alignof__(u64) typedef struct { u64 key[2]; } siphash_key_t; #define siphash_aligned_key_t siphash_key_t __aligned(16) static inline bool siphash_key_is_zero(const siphash_key_t *key) { return !(key->key[0] | key->key[1]); } u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key); u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key); u64 siphash_1u64(const u64 a, const siphash_key_t *key); u64 siphash_2u64(const u64 a, const u64 b, const siphash_key_t *key); u64 siphash_3u64(const u64 a, const u64 b, const u64 c, const siphash_key_t *key); u64 siphash_4u64(const u64 a, const u64 b, const u64 c, const u64 d, const siphash_key_t *key); u64 siphash_1u32(const u32 a, const siphash_key_t *key); u64 siphash_3u32(const u32 a, const u32 b, const u32 c, const siphash_key_t *key); static inline u64 siphash_2u32(const u32 a, const u32 b, const siphash_key_t *key) { return siphash_1u64((u64)b << 32 | a, key); } static inline u64 siphash_4u32(const u32 a, const u32 b, const u32 c, const u32 d, const siphash_key_t *key) { return siphash_2u64((u64)b << 32 | a, (u64)d << 32 | c, key); } static inline u64 ___siphash_aligned(const __le64 *data, size_t len, const siphash_key_t *key) { if (__builtin_constant_p(len) && len == 4) return siphash_1u32(le32_to_cpup((const __le32 *)data), key); if (__builtin_constant_p(len) && len == 8) return siphash_1u64(le64_to_cpu(data[0]), key); if (__builtin_constant_p(len) && len == 16) return siphash_2u64(le64_to_cpu(data[0]), le64_to_cpu(data[1]), key); if (__builtin_constant_p(len) && len == 24) return siphash_3u64(le64_to_cpu(data[0]), le64_to_cpu(data[1]), le64_to_cpu(data[2]), key); if (__builtin_constant_p(len) && len == 32) return siphash_4u64(le64_to_cpu(data[0]), le64_to_cpu(data[1]), le64_to_cpu(data[2]), le64_to_cpu(data[3]), key); return __siphash_aligned(data, len, key); } /** * siphash - compute 64-bit siphash PRF value * @data: buffer to hash * @size: size of @data * @key: the siphash key */ static inline u64 siphash(const void *data, size_t len, const siphash_key_t *key) { if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || !IS_ALIGNED((unsigned long)data, SIPHASH_ALIGNMENT)) return __siphash_unaligned(data, len, key); return ___siphash_aligned(data, len, key); } #define HSIPHASH_ALIGNMENT __alignof__(unsigned long) typedef struct { unsigned long key[2]; } hsiphash_key_t; u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key); u32 __hsiphash_unaligned(const void *data, size_t len, const hsiphash_key_t *key); u32 hsiphash_1u32(const u32 a, const hsiphash_key_t *key); u32 hsiphash_2u32(const u32 a, const u32 b, const hsiphash_key_t *key); u32 hsiphash_3u32(const u32 a, const u32 b, const u32 c, const hsiphash_key_t *key); u32 hsiphash_4u32(const u32 a, const u32 b, const u32 c, const u32 d, const hsiphash_key_t *key); static inline u32 ___hsiphash_aligned(const __le32 *data, size_t len, const hsiphash_key_t *key) { if (__builtin_constant_p(len) && len == 4) return hsiphash_1u32(le32_to_cpu(data[0]), key); if (__builtin_constant_p(len) && len == 8) return hsiphash_2u32(le32_to_cpu(data[0]), le32_to_cpu(data[1]), key); if (__builtin_constant_p(len) && len == 12) return hsiphash_3u32(le32_to_cpu(data[0]), le32_to_cpu(data[1]), le32_to_cpu(data[2]), key); if (__builtin_constant_p(len) && len == 16) return hsiphash_4u32(le32_to_cpu(data[0]), le32_to_cpu(data[1]), le32_to_cpu(data[2]), le32_to_cpu(data[3]), key); return __hsiphash_aligned(data, len, key); } /** * hsiphash - compute 32-bit hsiphash PRF value * @data: buffer to hash * @size: size of @data * @key: the hsiphash key */ static inline u32 hsiphash(const void *data, size_t len, const hsiphash_key_t *key) { if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || !IS_ALIGNED((unsigned long)data, HSIPHASH_ALIGNMENT)) return __hsiphash_unaligned(data, len, key); return ___hsiphash_aligned(data, len, key); } /* * These macros expose the raw SipHash and HalfSipHash permutations. * Do not use them directly! If you think you have a use for them, * be sure to CC the maintainer of this file explaining why. */ #define SIPHASH_PERMUTATION(a, b, c, d) ( \ (a) += (b), (b) = rol64((b), 13), (b) ^= (a), (a) = rol64((a), 32), \ (c) += (d), (d) = rol64((d), 16), (d) ^= (c), \ (a) += (d), (d) = rol64((d), 21), (d) ^= (a), \ (c) += (b), (b) = rol64((b), 17), (b) ^= (c), (c) = rol64((c), 32)) #define SIPHASH_CONST_0 0x736f6d6570736575ULL #define SIPHASH_CONST_1 0x646f72616e646f6dULL #define SIPHASH_CONST_2 0x6c7967656e657261ULL #define SIPHASH_CONST_3 0x7465646279746573ULL #define HSIPHASH_PERMUTATION(a, b, c, d) ( \ (a) += (b), (b) = rol32((b), 5), (b) ^= (a), (a) = rol32((a), 16), \ (c) += (d), (d) = rol32((d), 8), (d) ^= (c), \ (a) += (d), (d) = rol32((d), 7), (d) ^= (a), \ (c) += (b), (b) = rol32((b), 13), (b) ^= (c), (c) = rol32((c), 16)) #define HSIPHASH_CONST_0 0U #define HSIPHASH_CONST_1 0U #define HSIPHASH_CONST_2 0x6c796765U #define HSIPHASH_CONST_3 0x74656462U #endif /* _LINUX_SIPHASH_H */
19 23 13 13 11 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 /* * Copyright (c) 2016 Intel Corporation * * Permission to use, copy, modify, distribute, and sell this software and its * documentation for any purpose is hereby granted without fee, provided that * the above copyright notice appear in all copies and that both that copyright * notice and this permission notice appear in supporting documentation, and * that the name of the copyright holders not be used in advertising or * publicity pertaining to distribution of the software without specific, * written prior permission. The copyright holders make no representations * about the suitability of this software for any purpose. It is provided "as * is" without express or implied warranty. * * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE * OF THIS SOFTWARE. */ #ifndef __DRM_FRAMEBUFFER_H__ #define __DRM_FRAMEBUFFER_H__ #include <linux/ctype.h> #include <linux/list.h> #include <linux/sched.h> #include <drm/drm_fourcc.h> #include <drm/drm_mode_object.h> struct drm_clip_rect; struct drm_device; struct drm_file; struct drm_framebuffer; struct drm_gem_object; /** * struct drm_framebuffer_funcs - framebuffer hooks */ struct drm_framebuffer_funcs { /** * @destroy: * * Clean up framebuffer resources, specifically also unreference the * backing storage. The core guarantees to call this function for every * framebuffer successfully created by calling * &drm_mode_config_funcs.fb_create. Drivers must also call * drm_framebuffer_cleanup() to release DRM core resources for this * framebuffer. */ void (*destroy)(struct drm_framebuffer *framebuffer); /** * @create_handle: * * Create a buffer handle in the driver-specific buffer manager (either * GEM or TTM) valid for the passed-in &struct drm_file. This is used by * the core to implement the GETFB IOCTL, which returns (for * sufficiently priviledged user) also a native buffer handle. This can * be used for seamless transitions between modesetting clients by * copying the current screen contents to a private buffer and blending * between that and the new contents. * * GEM based drivers should call drm_gem_handle_create() to create the * handle. * * RETURNS: * * 0 on success or a negative error code on failure. */ int (*create_handle)(struct drm_framebuffer *fb, struct drm_file *file_priv, unsigned int *handle); /** * @dirty: * * Optional callback for the dirty fb IOCTL. * * Userspace can notify the driver via this callback that an area of the * framebuffer has changed and should be flushed to the display * hardware. This can also be used internally, e.g. by the fbdev * emulation, though that's not the case currently. * * See documentation in drm_mode.h for the struct drm_mode_fb_dirty_cmd * for more information as all the semantics and arguments have a one to * one mapping on this function. * * Atomic drivers should use drm_atomic_helper_dirtyfb() to implement * this hook. * * RETURNS: * * 0 on success or a negative error code on failure. */ int (*dirty)(struct drm_framebuffer *framebuffer, struct drm_file *file_priv, unsigned flags, unsigned color, struct drm_clip_rect *clips, unsigned num_clips); }; /** * struct drm_framebuffer - frame buffer object * * Note that the fb is refcounted for the benefit of driver internals, * for example some hw, disabling a CRTC/plane is asynchronous, and * scanout does not actually complete until the next vblank. So some * cleanup (like releasing the reference(s) on the backing GEM bo(s)) * should be deferred. In cases like this, the driver would like to * hold a ref to the fb even though it has already been removed from * userspace perspective. See drm_framebuffer_get() and * drm_framebuffer_put(). * * The refcount is stored inside the mode object @base. */ struct drm_framebuffer { /** * @dev: DRM device this framebuffer belongs to */ struct drm_device *dev; /** * @head: Place on the &drm_mode_config.fb_list, access protected by * &drm_mode_config.fb_lock. */ struct list_head head; /** * @base: base modeset object structure, contains the reference count. */ struct drm_mode_object base; /** * @comm: Name of the process allocating the fb, used for fb dumping. */ char comm[TASK_COMM_LEN]; /** * @format: framebuffer format information */ const struct drm_format_info *format; /** * @funcs: framebuffer vfunc table */ const struct drm_framebuffer_funcs *funcs; /** * @pitches: Line stride per buffer. For userspace created object this * is copied from drm_mode_fb_cmd2. */ unsigned int pitches[DRM_FORMAT_MAX_PLANES]; /** * @offsets: Offset from buffer start to the actual pixel data in bytes, * per buffer. For userspace created object this is copied from * drm_mode_fb_cmd2. * * Note that this is a linear offset and does not take into account * tiling or buffer layout per @modifier. It is meant to be used when * the actual pixel data for this framebuffer plane starts at an offset, * e.g. when multiple planes are allocated within the same backing * storage buffer object. For tiled layouts this generally means its * @offsets must at least be tile-size aligned, but hardware often has * stricter requirements. * * This should not be used to specifiy x/y pixel offsets into the buffer * data (even for linear buffers). Specifying an x/y pixel offset is * instead done through the source rectangle in &struct drm_plane_state. */ unsigned int offsets[DRM_FORMAT_MAX_PLANES]; /** * @modifier: Data layout modifier. This is used to describe * tiling, or also special layouts (like compression) of auxiliary * buffers. For userspace created object this is copied from * drm_mode_fb_cmd2. */ uint64_t modifier; /** * @width: Logical width of the visible area of the framebuffer, in * pixels. */ unsigned int width; /** * @height: Logical height of the visible area of the framebuffer, in * pixels. */ unsigned int height; /** * @flags: Framebuffer flags like DRM_MODE_FB_INTERLACED or * DRM_MODE_FB_MODIFIERS. */ int flags; /** * @filp_head: Placed on &drm_file.fbs, protected by &drm_file.fbs_lock. */ struct list_head filp_head; /** * @obj: GEM objects backing the framebuffer, one per plane (optional). * * This is used by the GEM framebuffer helpers, see e.g. * drm_gem_fb_create(). */ struct drm_gem_object *obj[DRM_FORMAT_MAX_PLANES]; }; #define obj_to_fb(x) container_of(x, struct drm_framebuffer, base) int drm_framebuffer_init(struct drm_device *dev, struct drm_framebuffer *fb, const struct drm_framebuffer_funcs *funcs); struct drm_framebuffer *drm_framebuffer_lookup(struct drm_device *dev, struct drm_file *file_priv, uint32_t id); void drm_framebuffer_remove(struct drm_framebuffer *fb); void drm_framebuffer_cleanup(struct drm_framebuffer *fb); void drm_framebuffer_unregister_private(struct drm_framebuffer *fb); /** * drm_framebuffer_get - acquire a framebuffer reference * @fb: DRM framebuffer * * This function increments the framebuffer's reference count. */ static inline void drm_framebuffer_get(struct drm_framebuffer *fb) { drm_mode_object_get(&fb->base); } /** * drm_framebuffer_put - release a framebuffer reference * @fb: DRM framebuffer * * This function decrements the framebuffer's reference count and frees the * framebuffer if the reference count drops to zero. */ static inline void drm_framebuffer_put(struct drm_framebuffer *fb) { drm_mode_object_put(&fb->base); } /** * drm_framebuffer_read_refcount - read the framebuffer reference count. * @fb: framebuffer * * This functions returns the framebuffer's reference count. */ static inline uint32_t drm_framebuffer_read_refcount(const struct drm_framebuffer *fb) { return kref_read(&fb->base.refcount); } /** * drm_framebuffer_assign - store a reference to the fb * @p: location to store framebuffer * @fb: new framebuffer (maybe NULL) * * This functions sets the location to store a reference to the framebuffer, * unreferencing the framebuffer that was previously stored in that location. */ static inline void drm_framebuffer_assign(struct drm_framebuffer **p, struct drm_framebuffer *fb) { if (fb) drm_framebuffer_get(fb); if (*p) drm_framebuffer_put(*p); *p = fb; } /* * drm_for_each_fb - iterate over all framebuffers * @fb: the loop cursor * @dev: the DRM device * * Iterate over all framebuffers of @dev. User must hold * &drm_mode_config.fb_lock. */ #define drm_for_each_fb(fb, dev) \ for (WARN_ON(!mutex_is_locked(&(dev)->mode_config.fb_lock)), \ fb = list_first_entry(&(dev)->mode_config.fb_list, \ struct drm_framebuffer, head); \ &fb->head != (&(dev)->mode_config.fb_list); \ fb = list_next_entry(fb, head)) /** * struct drm_afbc_framebuffer - a special afbc frame buffer object * * A derived class of struct drm_framebuffer, dedicated for afbc use cases. */ struct drm_afbc_framebuffer { /** * @base: base framebuffer structure. */ struct drm_framebuffer base; /** * @block_width: width of a single afbc block */ u32 block_width; /** * @block_height: height of a single afbc block */ u32 block_height; /** * @aligned_width: aligned frame buffer width */ u32 aligned_width; /** * @aligned_height: aligned frame buffer height */ u32 aligned_height; /** * @offset: offset of the first afbc header */ u32 offset; /** * @afbc_size: minimum size of afbc buffer */ u32 afbc_size; }; #define fb_to_afbc_fb(x) container_of(x, struct drm_afbc_framebuffer, base) #endif
2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 // SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/panic.c * * Copyright (C) 1991, 1992 Linus Torvalds */ /* * This function is used through-out the kernel (including mm and fs) * to indicate a major problem. */ #include <linux/debug_locks.h> #include <linux/sched/debug.h> #include <linux/interrupt.h> #include <linux/kgdb.h> #include <linux/kmsg_dump.h> #include <linux/kallsyms.h> #include <linux/notifier.h> #include <linux/vt_kern.h> #include <linux/module.h> #include <linux/random.h> #include <linux/ftrace.h> #include <linux/reboot.h> #include <linux/delay.h> #include <linux/kexec.h> #include <linux/panic_notifier.h> #include <linux/sched.h> #include <linux/string_helpers.h> #include <linux/sysrq.h> #include <linux/init.h> #include <linux/nmi.h> #include <linux/console.h> #include <linux/bug.h> #include <linux/ratelimit.h> #include <linux/debugfs.h> #include <linux/sysfs.h> #include <linux/context_tracking.h> #include <trace/events/error_report.h> #include <asm/sections.h> #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 #ifdef CONFIG_SMP /* * Should we dump all CPUs backtraces in an oops event? * Defaults to 0, can be changed via sysctl. */ static unsigned int __read_mostly sysctl_oops_all_cpu_backtrace; #else #define sysctl_oops_all_cpu_backtrace 0 #endif /* CONFIG_SMP */ int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; static unsigned long tainted_mask = IS_ENABLED(CONFIG_RANDSTRUCT) ? (1 << TAINT_RANDSTRUCT) : 0; static int pause_on_oops; static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); bool crash_kexec_post_notifiers; int panic_on_warn __read_mostly; unsigned long panic_on_taint; bool panic_on_taint_nousertaint = false; static unsigned int warn_limit __read_mostly; int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); #define PANIC_PRINT_TASK_INFO 0x00000001 #define PANIC_PRINT_MEM_INFO 0x00000002 #define PANIC_PRINT_TIMER_INFO 0x00000004 #define PANIC_PRINT_LOCK_INFO 0x00000008 #define PANIC_PRINT_FTRACE_INFO 0x00000010 #define PANIC_PRINT_ALL_PRINTK_MSG 0x00000020 #define PANIC_PRINT_ALL_CPU_BT 0x00000040 #define PANIC_PRINT_BLOCKED_TASKS 0x00000080 unsigned long panic_print; ATOMIC_NOTIFIER_HEAD(panic_notifier_list); EXPORT_SYMBOL(panic_notifier_list); #ifdef CONFIG_SYSCTL static struct ctl_table kern_panic_table[] = { #ifdef CONFIG_SMP { .procname = "oops_all_cpu_backtrace", .data = &sysctl_oops_all_cpu_backtrace, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #endif { .procname = "warn_limit", .data = &warn_limit, .maxlen = sizeof(warn_limit), .mode = 0644, .proc_handler = proc_douintvec, }, }; static __init int kernel_panic_sysctls_init(void) { register_sysctl_init("kernel", kern_panic_table); return 0; } late_initcall(kernel_panic_sysctls_init); #endif static atomic_t warn_count = ATOMIC_INIT(0); #ifdef CONFIG_SYSFS static ssize_t warn_count_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { return sysfs_emit(page, "%d\n", atomic_read(&warn_count)); } static struct kobj_attribute warn_count_attr = __ATTR_RO(warn_count); static __init int kernel_panic_sysfs_init(void) { sysfs_add_file_to_group(kernel_kobj, &warn_count_attr.attr, NULL); return 0; } late_initcall(kernel_panic_sysfs_init); #endif static long no_blink(int state) { return 0; } /* Returns how long it waited in ms */ long (*panic_blink)(int state); EXPORT_SYMBOL(panic_blink); /* * Stop ourself in panic -- architecture code may override this */ void __weak __noreturn panic_smp_self_stop(void) { while (1) cpu_relax(); } /* * Stop ourselves in NMI context if another CPU has already panicked. Arch code * may override this to prepare for crash dumping, e.g. save regs info. */ void __weak __noreturn nmi_panic_self_stop(struct pt_regs *regs) { panic_smp_self_stop(); } /* * Stop other CPUs in panic. Architecture dependent code may override this * with more suitable version. For example, if the architecture supports * crash dump, it should save registers of each stopped CPU and disable * per-CPU features such as virtualization extensions. */ void __weak crash_smp_send_stop(void) { static int cpus_stopped; /* * This function can be called twice in panic path, but obviously * we execute this only once. */ if (cpus_stopped) return; /* * Note smp_send_stop is the usual smp shutdown function, which * unfortunately means it may not be hardened to work in a panic * situation. */ smp_send_stop(); cpus_stopped = 1; } atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); /* * A variant of panic() called from NMI context. We return if we've already * panicked on this CPU. If another CPU already panicked, loop in * nmi_panic_self_stop() which can provide architecture dependent code such * as saving register state for crash dump. */ void nmi_panic(struct pt_regs *regs, const char *msg) { int old_cpu, this_cpu; old_cpu = PANIC_CPU_INVALID; this_cpu = raw_smp_processor_id(); /* atomic_try_cmpxchg updates old_cpu on failure */ if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) panic("%s", msg); else if (old_cpu != this_cpu) nmi_panic_self_stop(regs); } EXPORT_SYMBOL(nmi_panic); static void panic_print_sys_info(bool console_flush) { if (console_flush) { if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG) console_flush_on_panic(CONSOLE_REPLAY_ALL); return; } if (panic_print & PANIC_PRINT_TASK_INFO) show_state(); if (panic_print & PANIC_PRINT_MEM_INFO) show_mem(); if (panic_print & PANIC_PRINT_TIMER_INFO) sysrq_timer_list_show(); if (panic_print & PANIC_PRINT_LOCK_INFO) debug_show_all_locks(); if (panic_print & PANIC_PRINT_FTRACE_INFO) ftrace_dump(DUMP_ALL); if (panic_print & PANIC_PRINT_BLOCKED_TASKS) show_state_filter(TASK_UNINTERRUPTIBLE); } void check_panic_on_warn(const char *origin) { unsigned int limit; if (panic_on_warn) panic("%s: panic_on_warn set ...\n", origin); limit = READ_ONCE(warn_limit); if (atomic_inc_return(&warn_count) >= limit && limit) panic("%s: system warned too often (kernel.warn_limit is %d)", origin, limit); } /* * Helper that triggers the NMI backtrace (if set in panic_print) * and then performs the secondary CPUs shutdown - we cannot have * the NMI backtrace after the CPUs are off! */ static void panic_other_cpus_shutdown(bool crash_kexec) { if (panic_print & PANIC_PRINT_ALL_CPU_BT) trigger_all_cpu_backtrace(); /* * Note that smp_send_stop() is the usual SMP shutdown function, * which unfortunately may not be hardened to work in a panic * situation. If we want to do crash dump after notifier calls * and kmsg_dump, we will need architecture dependent extra * bits in addition to stopping other CPUs, hence we rely on * crash_smp_send_stop() for that. */ if (!crash_kexec) smp_send_stop(); else crash_smp_send_stop(); } /** * panic - halt the system * @fmt: The text string to print * * Display a message, then perform cleanups. * * This function never returns. */ void panic(const char *fmt, ...) { static char buf[1024]; va_list args; long i, i_next = 0, len; int state = 0; int old_cpu, this_cpu; bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers; if (panic_on_warn) { /* * This thread may hit another WARN() in the panic path. * Resetting this prevents additional WARN() from panicking the * system on this thread. Other threads are blocked by the * panic_mutex in panic(). */ panic_on_warn = 0; } /* * Disable local interrupts. This will prevent panic_smp_self_stop * from deadlocking the first cpu that invokes the panic, since * there is nothing to prevent an interrupt handler (that runs * after setting panic_cpu) from invoking panic() again. */ local_irq_disable(); preempt_disable_notrace(); /* * It's possible to come here directly from a panic-assertion and * not have preempt disabled. Some functions called from here want * preempt to be disabled. No point enabling it later though... * * Only one CPU is allowed to execute the panic code from here. For * multiple parallel invocations of panic, all other CPUs either * stop themself or will wait until they are stopped by the 1st CPU * with smp_send_stop(). * * cmpxchg success means this is the 1st CPU which comes here, * so go ahead. * `old_cpu == this_cpu' means we came from nmi_panic() which sets * panic_cpu to this CPU. In this case, this is also the 1st CPU. */ old_cpu = PANIC_CPU_INVALID; this_cpu = raw_smp_processor_id(); /* atomic_try_cmpxchg updates old_cpu on failure */ if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) { /* go ahead */ } else if (old_cpu != this_cpu) panic_smp_self_stop(); console_verbose(); bust_spinlocks(1); va_start(args, fmt); len = vscnprintf(buf, sizeof(buf), fmt, args); va_end(args); if (len && buf[len - 1] == '\n') buf[len - 1] = '\0'; pr_emerg("Kernel panic - not syncing: %s\n", buf); #ifdef CONFIG_DEBUG_BUGVERBOSE /* * Avoid nested stack-dumping if a panic occurs during oops processing */ if (!test_taint(TAINT_DIE) && oops_in_progress <= 1) dump_stack(); #endif /* * If kgdb is enabled, give it a chance to run before we stop all * the other CPUs or else we won't be able to debug processes left * running on them. */ kgdb_panic(buf); /* * If we have crashed and we have a crash kernel loaded let it handle * everything else. * If we want to run this after calling panic_notifiers, pass * the "crash_kexec_post_notifiers" option to the kernel. * * Bypass the panic_cpu check and call __crash_kexec directly. */ if (!_crash_kexec_post_notifiers) __crash_kexec(NULL); panic_other_cpus_shutdown(_crash_kexec_post_notifiers); /* * Run any panic handlers, including those that might need to * add information to the kmsg dump output. */ atomic_notifier_call_chain(&panic_notifier_list, 0, buf); panic_print_sys_info(false); kmsg_dump(KMSG_DUMP_PANIC); /* * If you doubt kdump always works fine in any situation, * "crash_kexec_post_notifiers" offers you a chance to run * panic_notifiers and dumping kmsg before kdump. * Note: since some panic_notifiers can make crashed kernel * more unstable, it can increase risks of the kdump failure too. * * Bypass the panic_cpu check and call __crash_kexec directly. */ if (_crash_kexec_post_notifiers) __crash_kexec(NULL); console_unblank(); /* * We may have ended up stopping the CPU holding the lock (in * smp_send_stop()) while still having some valuable data in the console * buffer. Try to acquire the lock then release it regardless of the * result. The release will also print the buffers out. Locks debug * should be disabled to avoid reporting bad unlock balance when * panic() is not being callled from OOPS. */ debug_locks_off(); console_flush_on_panic(CONSOLE_FLUSH_PENDING); panic_print_sys_info(true); if (!panic_blink) panic_blink = no_blink; if (panic_timeout > 0) { /* * Delay timeout seconds before rebooting the machine. * We can't use the "normal" timers since we just panicked. */ pr_emerg("Rebooting in %d seconds..\n", panic_timeout); for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) { touch_nmi_watchdog(); if (i >= i_next) { i += panic_blink(state ^= 1); i_next = i + 3600 / PANIC_BLINK_SPD; } mdelay(PANIC_TIMER_STEP); } } if (panic_timeout != 0) { /* * This will not be a clean reboot, with everything * shutting down. But if there is a chance of * rebooting the system it will be rebooted. */ if (panic_reboot_mode != REBOOT_UNDEFINED) reboot_mode = panic_reboot_mode; emergency_restart(); } #ifdef __sparc__ { extern int stop_a_enabled; /* Make sure the user can actually press Stop-A (L1-A) */ stop_a_enabled = 1; pr_emerg("Press Stop-A (L1-A) from sun keyboard or send break\n" "twice on console to return to the boot prom\n"); } #endif #if defined(CONFIG_S390) disabled_wait(); #endif pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf); /* Do not scroll important messages printed above */ suppress_printk = 1; /* * The final messages may not have been printed if in a context that * defers printing (such as NMI) and irq_work is not available. * Explicitly flush the kernel log buffer one last time. */ console_flush_on_panic(CONSOLE_FLUSH_PENDING); local_irq_enable(); for (i = 0; ; i += PANIC_TIMER_STEP) { touch_softlockup_watchdog(); if (i >= i_next) { i += panic_blink(state ^= 1); i_next = i + 3600 / PANIC_BLINK_SPD; } mdelay(PANIC_TIMER_STEP); } } EXPORT_SYMBOL(panic); /* * TAINT_FORCED_RMMOD could be a per-module flag but the module * is being removed anyway. */ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { [ TAINT_PROPRIETARY_MODULE ] = { 'P', 'G', true }, [ TAINT_FORCED_MODULE ] = { 'F', ' ', true }, [ TAINT_CPU_OUT_OF_SPEC ] = { 'S', ' ', false }, [ TAINT_FORCED_RMMOD ] = { 'R', ' ', false }, [ TAINT_MACHINE_CHECK ] = { 'M', ' ', false }, [ TAINT_BAD_PAGE ] = { 'B', ' ', false }, [ TAINT_USER ] = { 'U', ' ', false }, [ TAINT_DIE ] = { 'D', ' ', false }, [ TAINT_OVERRIDDEN_ACPI_TABLE ] = { 'A', ' ', false }, [ TAINT_WARN ] = { 'W', ' ', false }, [ TAINT_CRAP ] = { 'C', ' ', true }, [ TAINT_FIRMWARE_WORKAROUND ] = { 'I', ' ', false }, [ TAINT_OOT_MODULE ] = { 'O', ' ', true }, [ TAINT_UNSIGNED_MODULE ] = { 'E', ' ', true }, [ TAINT_SOFTLOCKUP ] = { 'L', ' ', false }, [ TAINT_LIVEPATCH ] = { 'K', ' ', true }, [ TAINT_AUX ] = { 'X', ' ', true }, [ TAINT_RANDSTRUCT ] = { 'T', ' ', true }, [ TAINT_TEST ] = { 'N', ' ', true }, }; /** * print_tainted - return a string to represent the kernel taint state. * * For individual taint flag meanings, see Documentation/admin-guide/sysctl/kernel.rst * * The string is overwritten by the next call to print_tainted(), * but is always NULL terminated. */ const char *print_tainted(void) { static char buf[TAINT_FLAGS_COUNT + sizeof("Tainted: ")]; BUILD_BUG_ON(ARRAY_SIZE(taint_flags) != TAINT_FLAGS_COUNT); if (tainted_mask) { char *s; int i; s = buf + sprintf(buf, "Tainted: "); for (i = 0; i < TAINT_FLAGS_COUNT; i++) { const struct taint_flag *t = &taint_flags[i]; *s++ = test_bit(i, &tainted_mask) ? t->c_true : t->c_false; } *s = 0; } else snprintf(buf, sizeof(buf), "Not tainted"); return buf; } int test_taint(unsigned flag) { return test_bit(flag, &tainted_mask); } EXPORT_SYMBOL(test_taint); unsigned long get_taint(void) { return tainted_mask; } /** * add_taint: add a taint flag if not already set. * @flag: one of the TAINT_* constants. * @lockdep_ok: whether lock debugging is still OK. * * If something bad has gone wrong, you'll want @lockdebug_ok = false, but for * some notewortht-but-not-corrupting cases, it can be set to true. */ void add_taint(unsigned flag, enum lockdep_ok lockdep_ok) { if (lockdep_ok == LOCKDEP_NOW_UNRELIABLE && __debug_locks_off()) pr_warn("Disabling lock debugging due to kernel taint\n"); set_bit(flag, &tainted_mask); if (tainted_mask & panic_on_taint) { panic_on_taint = 0; panic("panic_on_taint set ..."); } } EXPORT_SYMBOL(add_taint); static void spin_msec(int msecs) { int i; for (i = 0; i < msecs; i++) { touch_nmi_watchdog(); mdelay(1); } } /* * It just happens that oops_enter() and oops_exit() are identically * implemented... */ static void do_oops_enter_exit(void) { unsigned long flags; static int spin_counter; if (!pause_on_oops) return; spin_lock_irqsave(&pause_on_oops_lock, flags); if (pause_on_oops_flag == 0) { /* This CPU may now print the oops message */ pause_on_oops_flag = 1; } else { /* We need to stall this CPU */ if (!spin_counter) { /* This CPU gets to do the counting */ spin_counter = pause_on_oops; do { spin_unlock(&pause_on_oops_lock); spin_msec(MSEC_PER_SEC); spin_lock(&pause_on_oops_lock); } while (--spin_counter); pause_on_oops_flag = 0; } else { /* This CPU waits for a different one */ while (spin_counter) { spin_unlock(&pause_on_oops_lock); spin_msec(1); spin_lock(&pause_on_oops_lock); } } } spin_unlock_irqrestore(&pause_on_oops_lock, flags); } /* * Return true if the calling CPU is allowed to print oops-related info. * This is a bit racy.. */ bool oops_may_print(void) { return pause_on_oops_flag == 0; } /* * Called when the architecture enters its oops handler, before it prints * anything. If this is the first CPU to oops, and it's oopsing the first * time then let it proceed. * * This is all enabled by the pause_on_oops kernel boot option. We do all * this to ensure that oopses don't scroll off the screen. It has the * side-effect of preventing later-oopsing CPUs from mucking up the display, * too. * * It turns out that the CPU which is allowed to print ends up pausing for * the right duration, whereas all the other CPUs pause for twice as long: * once in oops_enter(), once in oops_exit(). */ void oops_enter(void) { tracing_off(); /* can't trust the integrity of the kernel anymore: */ debug_locks_off(); do_oops_enter_exit(); if (sysctl_oops_all_cpu_backtrace) trigger_all_cpu_backtrace(); } static void print_oops_end_marker(void) { pr_warn("---[ end trace %016llx ]---\n", 0ULL); } /* * Called when the architecture exits its oops handler, after printing * everything. */ void oops_exit(void) { do_oops_enter_exit(); print_oops_end_marker(); kmsg_dump(KMSG_DUMP_OOPS); } struct warn_args { const char *fmt; va_list args; }; void __warn(const char *file, int line, void *caller, unsigned taint, struct pt_regs *regs, struct warn_args *args) { disable_trace_on_warning(); if (file) pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n", raw_smp_processor_id(), current->pid, file, line, caller); else pr_warn("WARNING: CPU: %d PID: %d at %pS\n", raw_smp_processor_id(), current->pid, caller); #pragma GCC diagnostic push #ifndef __clang__ #pragma GCC diagnostic ignored "-Wsuggest-attribute=format" #endif if (args) vprintk(args->fmt, args->args); #pragma GCC diagnostic pop print_modules(); if (regs) show_regs(regs); check_panic_on_warn("kernel"); if (!regs) dump_stack(); print_irqtrace_events(current); print_oops_end_marker(); trace_error_report_end(ERROR_DETECTOR_WARN, (unsigned long)caller); /* Just a warning, don't kill lockdep. */ add_taint(taint, LOCKDEP_STILL_OK); } #ifdef CONFIG_BUG #ifndef __WARN_FLAGS void warn_slowpath_fmt(const char *file, int line, unsigned taint, const char *fmt, ...) { bool rcu = warn_rcu_enter(); struct warn_args args; pr_warn(CUT_HERE); if (!fmt) { __warn(file, line, __builtin_return_address(0), taint, NULL, NULL); warn_rcu_exit(rcu); return; } args.fmt = fmt; va_start(args.args, fmt); __warn(file, line, __builtin_return_address(0), taint, NULL, &args); va_end(args.args); warn_rcu_exit(rcu); } EXPORT_SYMBOL(warn_slowpath_fmt); #else void __warn_printk(const char *fmt, ...) { bool rcu = warn_rcu_enter(); va_list args; pr_warn(CUT_HERE); va_start(args, fmt); vprintk(fmt, args); va_end(args); warn_rcu_exit(rcu); } EXPORT_SYMBOL(__warn_printk); #endif /* Support resetting WARN*_ONCE state */ static int clear_warn_once_set(void *data, u64 val) { generic_bug_clear_once(); memset(__start_once, 0, __end_once - __start_once); return 0; } DEFINE_DEBUGFS_ATTRIBUTE(clear_warn_once_fops, NULL, clear_warn_once_set, "%lld\n"); static __init int register_warn_debugfs(void) { /* Don't care about failure */ debugfs_create_file_unsafe("clear_warn_once", 0200, NULL, NULL, &clear_warn_once_fops); return 0; } device_initcall(register_warn_debugfs); #endif #ifdef CONFIG_STACKPROTECTOR /* * Called when gcc's -fstack-protector feature is used, and * gcc detects corruption of the on-stack canary value */ __visible noinstr void __stack_chk_fail(void) { instrumentation_begin(); panic("stack-protector: Kernel stack is corrupted in: %pB", __builtin_return_address(0)); instrumentation_end(); } EXPORT_SYMBOL(__stack_chk_fail); #endif core_param(panic, panic_timeout, int, 0644); core_param(panic_print, panic_print, ulong, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); core_param(panic_on_warn, panic_on_warn, int, 0644); core_param(crash_kexec_post_notifiers, crash_kexec_post_notifiers, bool, 0644); static int __init oops_setup(char *s) { if (!s) return -EINVAL; if (!strcmp(s, "panic")) panic_on_oops = 1; return 0; } early_param("oops", oops_setup); static int __init panic_on_taint_setup(char *s) { char *taint_str; if (!s) return -EINVAL; taint_str = strsep(&s, ","); if (kstrtoul(taint_str, 16, &panic_on_taint)) return -EINVAL; /* make sure panic_on_taint doesn't hold out-of-range TAINT flags */ panic_on_taint &= TAINT_FLAGS_MAX; if (!panic_on_taint) return -EINVAL; if (s && !strcmp(s, "nousertaint")) panic_on_taint_nousertaint = true; pr_info("panic_on_taint: bitmask=0x%lx nousertaint_mode=%s\n", panic_on_taint, str_enabled_disabled(panic_on_taint_nousertaint)); return 0; } early_param("panic_on_taint", panic_on_taint_setup);
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 6 1 1 5 5 5 5 5 1 1 6 4 2 4 4 4 4 4 4 4 4 4 4 4 3 2 2 3 3 1 1 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 // SPDX-License-Identifier: GPL-2.0-only #include "netlink.h" #include "common.h" struct rings_req_info { struct ethnl_req_info base; }; struct rings_reply_data { struct ethnl_reply_data base; struct ethtool_ringparam ringparam; struct kernel_ethtool_ringparam kernel_ringparam; u32 supported_ring_params; }; #define RINGS_REPDATA(__reply_base) \ container_of(__reply_base, struct rings_reply_data, base) const struct nla_policy ethnl_rings_get_policy[] = { [ETHTOOL_A_RINGS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static int rings_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct rings_reply_data *data = RINGS_REPDATA(reply_base); struct net_device *dev = reply_base->dev; int ret; if (!dev->ethtool_ops->get_ringparam) return -EOPNOTSUPP; data->supported_ring_params = dev->ethtool_ops->supported_ring_params; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; dev->ethtool_ops->get_ringparam(dev, &data->ringparam, &data->kernel_ringparam, info->extack); ethnl_ops_complete(dev); return 0; } static int rings_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { return nla_total_size(sizeof(u32)) + /* _RINGS_RX_MAX */ nla_total_size(sizeof(u32)) + /* _RINGS_RX_MINI_MAX */ nla_total_size(sizeof(u32)) + /* _RINGS_RX_JUMBO_MAX */ nla_total_size(sizeof(u32)) + /* _RINGS_TX_MAX */ nla_total_size(sizeof(u32)) + /* _RINGS_RX */ nla_total_size(sizeof(u32)) + /* _RINGS_RX_MINI */ nla_total_size(sizeof(u32)) + /* _RINGS_RX_JUMBO */ nla_total_size(sizeof(u32)) + /* _RINGS_TX */ nla_total_size(sizeof(u32)) + /* _RINGS_RX_BUF_LEN */ nla_total_size(sizeof(u8)) + /* _RINGS_TCP_DATA_SPLIT */ nla_total_size(sizeof(u32) + /* _RINGS_CQE_SIZE */ nla_total_size(sizeof(u8)) + /* _RINGS_TX_PUSH */ nla_total_size(sizeof(u8))) + /* _RINGS_RX_PUSH */ nla_total_size(sizeof(u32)) + /* _RINGS_TX_PUSH_BUF_LEN */ nla_total_size(sizeof(u32)); /* _RINGS_TX_PUSH_BUF_LEN_MAX */ } static int rings_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct rings_reply_data *data = RINGS_REPDATA(reply_base); const struct kernel_ethtool_ringparam *kr = &data->kernel_ringparam; const struct ethtool_ringparam *ringparam = &data->ringparam; u32 supported_ring_params = data->supported_ring_params; WARN_ON(kr->tcp_data_split > ETHTOOL_TCP_DATA_SPLIT_ENABLED); if ((ringparam->rx_max_pending && (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_MAX, ringparam->rx_max_pending) || nla_put_u32(skb, ETHTOOL_A_RINGS_RX, ringparam->rx_pending))) || (ringparam->rx_mini_max_pending && (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_MINI_MAX, ringparam->rx_mini_max_pending) || nla_put_u32(skb, ETHTOOL_A_RINGS_RX_MINI, ringparam->rx_mini_pending))) || (ringparam->rx_jumbo_max_pending && (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_JUMBO_MAX, ringparam->rx_jumbo_max_pending) || nla_put_u32(skb, ETHTOOL_A_RINGS_RX_JUMBO, ringparam->rx_jumbo_pending))) || (ringparam->tx_max_pending && (nla_put_u32(skb, ETHTOOL_A_RINGS_TX_MAX, ringparam->tx_max_pending) || nla_put_u32(skb, ETHTOOL_A_RINGS_TX, ringparam->tx_pending))) || (kr->rx_buf_len && (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_BUF_LEN, kr->rx_buf_len))) || (kr->tcp_data_split && (nla_put_u8(skb, ETHTOOL_A_RINGS_TCP_DATA_SPLIT, kr->tcp_data_split))) || (kr->cqe_size && (nla_put_u32(skb, ETHTOOL_A_RINGS_CQE_SIZE, kr->cqe_size))) || nla_put_u8(skb, ETHTOOL_A_RINGS_TX_PUSH, !!kr->tx_push) || nla_put_u8(skb, ETHTOOL_A_RINGS_RX_PUSH, !!kr->rx_push) || ((supported_ring_params & ETHTOOL_RING_USE_TX_PUSH_BUF_LEN) && (nla_put_u32(skb, ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN_MAX, kr->tx_push_buf_max_len) || nla_put_u32(skb, ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN, kr->tx_push_buf_len)))) return -EMSGSIZE; return 0; } /* RINGS_SET */ const struct nla_policy ethnl_rings_set_policy[] = { [ETHTOOL_A_RINGS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_RINGS_RX] = { .type = NLA_U32 }, [ETHTOOL_A_RINGS_RX_MINI] = { .type = NLA_U32 }, [ETHTOOL_A_RINGS_RX_JUMBO] = { .type = NLA_U32 }, [ETHTOOL_A_RINGS_TX] = { .type = NLA_U32 }, [ETHTOOL_A_RINGS_RX_BUF_LEN] = NLA_POLICY_MIN(NLA_U32, 1), [ETHTOOL_A_RINGS_TCP_DATA_SPLIT] = NLA_POLICY_MAX(NLA_U8, ETHTOOL_TCP_DATA_SPLIT_ENABLED), [ETHTOOL_A_RINGS_CQE_SIZE] = NLA_POLICY_MIN(NLA_U32, 1), [ETHTOOL_A_RINGS_TX_PUSH] = NLA_POLICY_MAX(NLA_U8, 1), [ETHTOOL_A_RINGS_RX_PUSH] = NLA_POLICY_MAX(NLA_U8, 1), [ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN] = { .type = NLA_U32 }, }; static int ethnl_set_rings_validate(struct ethnl_req_info *req_info, struct genl_info *info) { const struct ethtool_ops *ops = req_info->dev->ethtool_ops; struct nlattr **tb = info->attrs; if (tb[ETHTOOL_A_RINGS_RX_BUF_LEN] && !(ops->supported_ring_params & ETHTOOL_RING_USE_RX_BUF_LEN)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_RX_BUF_LEN], "setting rx buf len not supported"); return -EOPNOTSUPP; } if (tb[ETHTOOL_A_RINGS_TCP_DATA_SPLIT] && !(ops->supported_ring_params & ETHTOOL_RING_USE_TCP_DATA_SPLIT)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_TCP_DATA_SPLIT], "setting TCP data split is not supported"); return -EOPNOTSUPP; } if (tb[ETHTOOL_A_RINGS_CQE_SIZE] && !(ops->supported_ring_params & ETHTOOL_RING_USE_CQE_SIZE)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_CQE_SIZE], "setting cqe size not supported"); return -EOPNOTSUPP; } if (tb[ETHTOOL_A_RINGS_TX_PUSH] && !(ops->supported_ring_params & ETHTOOL_RING_USE_TX_PUSH)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_TX_PUSH], "setting tx push not supported"); return -EOPNOTSUPP; } if (tb[ETHTOOL_A_RINGS_RX_PUSH] && !(ops->supported_ring_params & ETHTOOL_RING_USE_RX_PUSH)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_RX_PUSH], "setting rx push not supported"); return -EOPNOTSUPP; } if (tb[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN] && !(ops->supported_ring_params & ETHTOOL_RING_USE_TX_PUSH_BUF_LEN)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN], "setting tx push buf len is not supported"); return -EOPNOTSUPP; } return ops->get_ringparam && ops->set_ringparam ? 1 : -EOPNOTSUPP; } static int ethnl_set_rings(struct ethnl_req_info *req_info, struct genl_info *info) { struct kernel_ethtool_ringparam kernel_ringparam = {}; struct ethtool_ringparam ringparam = {}; struct net_device *dev = req_info->dev; struct nlattr **tb = info->attrs; const struct nlattr *err_attr; bool mod = false; int ret; dev->ethtool_ops->get_ringparam(dev, &ringparam, &kernel_ringparam, info->extack); ethnl_update_u32(&ringparam.rx_pending, tb[ETHTOOL_A_RINGS_RX], &mod); ethnl_update_u32(&ringparam.rx_mini_pending, tb[ETHTOOL_A_RINGS_RX_MINI], &mod); ethnl_update_u32(&ringparam.rx_jumbo_pending, tb[ETHTOOL_A_RINGS_RX_JUMBO], &mod); ethnl_update_u32(&ringparam.tx_pending, tb[ETHTOOL_A_RINGS_TX], &mod); ethnl_update_u32(&kernel_ringparam.rx_buf_len, tb[ETHTOOL_A_RINGS_RX_BUF_LEN], &mod); ethnl_update_u8(&kernel_ringparam.tcp_data_split, tb[ETHTOOL_A_RINGS_TCP_DATA_SPLIT], &mod); ethnl_update_u32(&kernel_ringparam.cqe_size, tb[ETHTOOL_A_RINGS_CQE_SIZE], &mod); ethnl_update_u8(&kernel_ringparam.tx_push, tb[ETHTOOL_A_RINGS_TX_PUSH], &mod); ethnl_update_u8(&kernel_ringparam.rx_push, tb[ETHTOOL_A_RINGS_RX_PUSH], &mod); ethnl_update_u32(&kernel_ringparam.tx_push_buf_len, tb[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN], &mod); if (!mod) return 0; /* ensure new ring parameters are within limits */ if (ringparam.rx_pending > ringparam.rx_max_pending) err_attr = tb[ETHTOOL_A_RINGS_RX]; else if (ringparam.rx_mini_pending > ringparam.rx_mini_max_pending) err_attr = tb[ETHTOOL_A_RINGS_RX_MINI]; else if (ringparam.rx_jumbo_pending > ringparam.rx_jumbo_max_pending) err_attr = tb[ETHTOOL_A_RINGS_RX_JUMBO]; else if (ringparam.tx_pending > ringparam.tx_max_pending) err_attr = tb[ETHTOOL_A_RINGS_TX]; else err_attr = NULL; if (err_attr) { NL_SET_ERR_MSG_ATTR(info->extack, err_attr, "requested ring size exceeds maximum"); return -EINVAL; } if (kernel_ringparam.tx_push_buf_len > kernel_ringparam.tx_push_buf_max_len) { NL_SET_ERR_MSG_ATTR_FMT(info->extack, tb[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN], "Requested TX push buffer exceeds the maximum of %u", kernel_ringparam.tx_push_buf_max_len); return -EINVAL; } ret = dev->ethtool_ops->set_ringparam(dev, &ringparam, &kernel_ringparam, info->extack); return ret < 0 ? ret : 1; } const struct ethnl_request_ops ethnl_rings_request_ops = { .request_cmd = ETHTOOL_MSG_RINGS_GET, .reply_cmd = ETHTOOL_MSG_RINGS_GET_REPLY, .hdr_attr = ETHTOOL_A_RINGS_HEADER, .req_info_size = sizeof(struct rings_req_info), .reply_data_size = sizeof(struct rings_reply_data), .prepare_data = rings_prepare_data, .reply_size = rings_reply_size, .fill_reply = rings_fill_reply, .set_validate = ethnl_set_rings_validate, .set = ethnl_set_rings, .set_ntf_cmd = ETHTOOL_MSG_RINGS_NTF, };
1 1 1 1 2 2 2 2 1 1 1 1 12 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 /* * Copyright (c) 2014 Samsung Electronics Co., Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sub license, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the * next paragraph) shall be included in all copies or substantial portions * of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ #include <linux/err.h> #include <linux/media-bus-format.h> #include <linux/module.h> #include <linux/mutex.h> #include <drm/drm_atomic_state_helper.h> #include <drm/drm_bridge.h> #include <drm/drm_debugfs.h> #include <drm/drm_edid.h> #include <drm/drm_encoder.h> #include <drm/drm_file.h> #include <drm/drm_of.h> #include <drm/drm_print.h> #include "drm_crtc_internal.h" /** * DOC: overview * * &struct drm_bridge represents a device that hangs on to an encoder. These are * handy when a regular &drm_encoder entity isn't enough to represent the entire * encoder chain. * * A bridge is always attached to a single &drm_encoder at a time, but can be * either connected to it directly, or through a chain of bridges:: * * [ CRTC ---> ] Encoder ---> Bridge A ---> Bridge B * * Here, the output of the encoder feeds to bridge A, and that furthers feeds to * bridge B. Bridge chains can be arbitrarily long, and shall be fully linear: * Chaining multiple bridges to the output of a bridge, or the same bridge to * the output of different bridges, is not supported. * * &drm_bridge, like &drm_panel, aren't &drm_mode_object entities like planes, * CRTCs, encoders or connectors and hence are not visible to userspace. They * just provide additional hooks to get the desired output at the end of the * encoder chain. */ /** * DOC: display driver integration * * Display drivers are responsible for linking encoders with the first bridge * in the chains. This is done by acquiring the appropriate bridge with * devm_drm_of_get_bridge(). Once acquired, the bridge shall be attached to the * encoder with a call to drm_bridge_attach(). * * Bridges are responsible for linking themselves with the next bridge in the * chain, if any. This is done the same way as for encoders, with the call to * drm_bridge_attach() occurring in the &drm_bridge_funcs.attach operation. * * Once these links are created, the bridges can participate along with encoder * functions to perform mode validation and fixup (through * drm_bridge_chain_mode_valid() and drm_atomic_bridge_chain_check()), mode * setting (through drm_bridge_chain_mode_set()), enable (through * drm_atomic_bridge_chain_pre_enable() and drm_atomic_bridge_chain_enable()) * and disable (through drm_atomic_bridge_chain_disable() and * drm_atomic_bridge_chain_post_disable()). Those functions call the * corresponding operations provided in &drm_bridge_funcs in sequence for all * bridges in the chain. * * For display drivers that use the atomic helpers * drm_atomic_helper_check_modeset(), * drm_atomic_helper_commit_modeset_enables() and * drm_atomic_helper_commit_modeset_disables() (either directly in hand-rolled * commit check and commit tail handlers, or through the higher-level * drm_atomic_helper_check() and drm_atomic_helper_commit_tail() or * drm_atomic_helper_commit_tail_rpm() helpers), this is done transparently and * requires no intervention from the driver. For other drivers, the relevant * DRM bridge chain functions shall be called manually. * * Bridges also participate in implementing the &drm_connector at the end of * the bridge chain. Display drivers may use the drm_bridge_connector_init() * helper to create the &drm_connector, or implement it manually on top of the * connector-related operations exposed by the bridge (see the overview * documentation of bridge operations for more details). */ /** * DOC: special care dsi * * The interaction between the bridges and other frameworks involved in * the probing of the upstream driver and the bridge driver can be * challenging. Indeed, there's multiple cases that needs to be * considered: * * - The upstream driver doesn't use the component framework and isn't a * MIPI-DSI host. In this case, the bridge driver will probe at some * point and the upstream driver should try to probe again by returning * EPROBE_DEFER as long as the bridge driver hasn't probed. * * - The upstream driver doesn't use the component framework, but is a * MIPI-DSI host. The bridge device uses the MIPI-DCS commands to be * controlled. In this case, the bridge device is a child of the * display device and when it will probe it's assured that the display * device (and MIPI-DSI host) is present. The upstream driver will be * assured that the bridge driver is connected between the * &mipi_dsi_host_ops.attach and &mipi_dsi_host_ops.detach operations. * Therefore, it must run mipi_dsi_host_register() in its probe * function, and then run drm_bridge_attach() in its * &mipi_dsi_host_ops.attach hook. * * - The upstream driver uses the component framework and is a MIPI-DSI * host. The bridge device uses the MIPI-DCS commands to be * controlled. This is the same situation than above, and can run * mipi_dsi_host_register() in either its probe or bind hooks. * * - The upstream driver uses the component framework and is a MIPI-DSI * host. The bridge device uses a separate bus (such as I2C) to be * controlled. In this case, there's no correlation between the probe * of the bridge and upstream drivers, so care must be taken to avoid * an endless EPROBE_DEFER loop, with each driver waiting for the * other to probe. * * The ideal pattern to cover the last item (and all the others in the * MIPI-DSI host driver case) is to split the operations like this: * * - The MIPI-DSI host driver must run mipi_dsi_host_register() in its * probe hook. It will make sure that the MIPI-DSI host sticks around, * and that the driver's bind can be called. * * - In its probe hook, the bridge driver must try to find its MIPI-DSI * host, register as a MIPI-DSI device and attach the MIPI-DSI device * to its host. The bridge driver is now functional. * * - In its &struct mipi_dsi_host_ops.attach hook, the MIPI-DSI host can * now add its component. Its bind hook will now be called and since * the bridge driver is attached and registered, we can now look for * and attach it. * * At this point, we're now certain that both the upstream driver and * the bridge driver are functional and we can't have a deadlock-like * situation when probing. */ /** * DOC: dsi bridge operations * * DSI host interfaces are expected to be implemented as bridges rather than * encoders, however there are a few aspects of their operation that need to * be defined in order to provide a consistent interface. * * A DSI host should keep the PHY powered down until the pre_enable operation is * called. All lanes are in an undefined idle state up to this point, and it * must not be assumed that it is LP-11. * pre_enable should initialise the PHY, set the data lanes to LP-11, and the * clock lane to either LP-11 or HS depending on the mode_flag * %MIPI_DSI_CLOCK_NON_CONTINUOUS. * * Ordinarily the downstream bridge DSI peripheral pre_enable will have been * called before the DSI host. If the DSI peripheral requires LP-11 and/or * the clock lane to be in HS mode prior to pre_enable, then it can set the * &pre_enable_prev_first flag to request the pre_enable (and * post_disable) order to be altered to enable the DSI host first. * * Either the CRTC being enabled, or the DSI host enable operation should switch * the host to actively transmitting video on the data lanes. * * The reverse also applies. The DSI host disable operation or stopping the CRTC * should stop transmitting video, and the data lanes should return to the LP-11 * state. The DSI host &post_disable operation should disable the PHY. * If the &pre_enable_prev_first flag is set, then the DSI peripheral's * bridge &post_disable will be called before the DSI host's post_disable. * * Whilst it is valid to call &host_transfer prior to pre_enable or after * post_disable, the exact state of the lanes is undefined at this point. The * DSI host should initialise the interface, transmit the data, and then disable * the interface again. * * Ultra Low Power State (ULPS) is not explicitly supported by DRM. If * implemented, it therefore needs to be handled entirely within the DSI Host * driver. */ static DEFINE_MUTEX(bridge_lock); static LIST_HEAD(bridge_list); /** * drm_bridge_add - add the given bridge to the global bridge list * * @bridge: bridge control structure */ void drm_bridge_add(struct drm_bridge *bridge) { mutex_init(&bridge->hpd_mutex); mutex_lock(&bridge_lock); list_add_tail(&bridge->list, &bridge_list); mutex_unlock(&bridge_lock); } EXPORT_SYMBOL(drm_bridge_add); static void drm_bridge_remove_void(void *bridge) { drm_bridge_remove(bridge); } /** * devm_drm_bridge_add - devm managed version of drm_bridge_add() * * @dev: device to tie the bridge lifetime to * @bridge: bridge control structure * * This is the managed version of drm_bridge_add() which automatically * calls drm_bridge_remove() when @dev is unbound. * * Return: 0 if no error or negative error code. */ int devm_drm_bridge_add(struct device *dev, struct drm_bridge *bridge) { drm_bridge_add(bridge); return devm_add_action_or_reset(dev, drm_bridge_remove_void, bridge); } EXPORT_SYMBOL(devm_drm_bridge_add); /** * drm_bridge_remove - remove the given bridge from the global bridge list * * @bridge: bridge control structure */ void drm_bridge_remove(struct drm_bridge *bridge) { mutex_lock(&bridge_lock); list_del_init(&bridge->list); mutex_unlock(&bridge_lock); mutex_destroy(&bridge->hpd_mutex); } EXPORT_SYMBOL(drm_bridge_remove); static struct drm_private_state * drm_bridge_atomic_duplicate_priv_state(struct drm_private_obj *obj) { struct drm_bridge *bridge = drm_priv_to_bridge(obj); struct drm_bridge_state *state; state = bridge->funcs->atomic_duplicate_state(bridge); return state ? &state->base : NULL; } static void drm_bridge_atomic_destroy_priv_state(struct drm_private_obj *obj, struct drm_private_state *s) { struct drm_bridge_state *state = drm_priv_to_bridge_state(s); struct drm_bridge *bridge = drm_priv_to_bridge(obj); bridge->funcs->atomic_destroy_state(bridge, state); } static const struct drm_private_state_funcs drm_bridge_priv_state_funcs = { .atomic_duplicate_state = drm_bridge_atomic_duplicate_priv_state, .atomic_destroy_state = drm_bridge_atomic_destroy_priv_state, }; /** * drm_bridge_attach - attach the bridge to an encoder's chain * * @encoder: DRM encoder * @bridge: bridge to attach * @previous: previous bridge in the chain (optional) * @flags: DRM_BRIDGE_ATTACH_* flags * * Called by a kms driver to link the bridge to an encoder's chain. The previous * argument specifies the previous bridge in the chain. If NULL, the bridge is * linked directly at the encoder's output. Otherwise it is linked at the * previous bridge's output. * * If non-NULL the previous bridge must be already attached by a call to this * function. * * Note that bridges attached to encoders are auto-detached during encoder * cleanup in drm_encoder_cleanup(), so drm_bridge_attach() should generally * *not* be balanced with a drm_bridge_detach() in driver code. * * RETURNS: * Zero on success, error code on failure */ int drm_bridge_attach(struct drm_encoder *encoder, struct drm_bridge *bridge, struct drm_bridge *previous, enum drm_bridge_attach_flags flags) { int ret; if (!encoder || !bridge) return -EINVAL; if (previous && (!previous->dev || previous->encoder != encoder)) return -EINVAL; if (bridge->dev) return -EBUSY; bridge->dev = encoder->dev; bridge->encoder = encoder; if (previous) list_add(&bridge->chain_node, &previous->chain_node); else list_add(&bridge->chain_node, &encoder->bridge_chain); if (bridge->funcs->attach) { ret = bridge->funcs->attach(bridge, flags); if (ret < 0) goto err_reset_bridge; } if (bridge->funcs->atomic_reset) { struct drm_bridge_state *state; state = bridge->funcs->atomic_reset(bridge); if (IS_ERR(state)) { ret = PTR_ERR(state); goto err_detach_bridge; } drm_atomic_private_obj_init(bridge->dev, &bridge->base, &state->base, &drm_bridge_priv_state_funcs); } return 0; err_detach_bridge: if (bridge->funcs->detach) bridge->funcs->detach(bridge); err_reset_bridge: bridge->dev = NULL; bridge->encoder = NULL; list_del(&bridge->chain_node); #ifdef CONFIG_OF DRM_ERROR("failed to attach bridge %pOF to encoder %s: %d\n", bridge->of_node, encoder->name, ret); #else DRM_ERROR("failed to attach bridge to encoder %s: %d\n", encoder->name, ret); #endif return ret; } EXPORT_SYMBOL(drm_bridge_attach); void drm_bridge_detach(struct drm_bridge *bridge) { if (WARN_ON(!bridge)) return; if (WARN_ON(!bridge->dev)) return; if (bridge->funcs->atomic_reset) drm_atomic_private_obj_fini(&bridge->base); if (bridge->funcs->detach) bridge->funcs->detach(bridge); list_del(&bridge->chain_node); bridge->dev = NULL; } /** * DOC: bridge operations * * Bridge drivers expose operations through the &drm_bridge_funcs structure. * The DRM internals (atomic and CRTC helpers) use the helpers defined in * drm_bridge.c to call bridge operations. Those operations are divided in * three big categories to support different parts of the bridge usage. * * - The encoder-related operations support control of the bridges in the * chain, and are roughly counterparts to the &drm_encoder_helper_funcs * operations. They are used by the legacy CRTC and the atomic modeset * helpers to perform mode validation, fixup and setting, and enable and * disable the bridge automatically. * * The enable and disable operations are split in * &drm_bridge_funcs.pre_enable, &drm_bridge_funcs.enable, * &drm_bridge_funcs.disable and &drm_bridge_funcs.post_disable to provide * finer-grained control. * * Bridge drivers may implement the legacy version of those operations, or * the atomic version (prefixed with atomic\_), in which case they shall also * implement the atomic state bookkeeping operations * (&drm_bridge_funcs.atomic_duplicate_state, * &drm_bridge_funcs.atomic_destroy_state and &drm_bridge_funcs.reset). * Mixing atomic and non-atomic versions of the operations is not supported. * * - The bus format negotiation operations * &drm_bridge_funcs.atomic_get_output_bus_fmts and * &drm_bridge_funcs.atomic_get_input_bus_fmts allow bridge drivers to * negotiate the formats transmitted between bridges in the chain when * multiple formats are supported. Negotiation for formats is performed * transparently for display drivers by the atomic modeset helpers. Only * atomic versions of those operations exist, bridge drivers that need to * implement them shall thus also implement the atomic version of the * encoder-related operations. This feature is not supported by the legacy * CRTC helpers. * * - The connector-related operations support implementing a &drm_connector * based on a chain of bridges. DRM bridges traditionally create a * &drm_connector for bridges meant to be used at the end of the chain. This * puts additional burden on bridge drivers, especially for bridges that may * be used in the middle of a chain or at the end of it. Furthermore, it * requires all operations of the &drm_connector to be handled by a single * bridge, which doesn't always match the hardware architecture. * * To simplify bridge drivers and make the connector implementation more * flexible, a new model allows bridges to unconditionally skip creation of * &drm_connector and instead expose &drm_bridge_funcs operations to support * an externally-implemented &drm_connector. Those operations are * &drm_bridge_funcs.detect, &drm_bridge_funcs.get_modes, * &drm_bridge_funcs.get_edid, &drm_bridge_funcs.hpd_notify, * &drm_bridge_funcs.hpd_enable and &drm_bridge_funcs.hpd_disable. When * implemented, display drivers shall create a &drm_connector instance for * each chain of bridges, and implement those connector instances based on * the bridge connector operations. * * Bridge drivers shall implement the connector-related operations for all * the features that the bridge hardware support. For instance, if a bridge * supports reading EDID, the &drm_bridge_funcs.get_edid shall be * implemented. This however doesn't mean that the DDC lines are wired to the * bridge on a particular platform, as they could also be connected to an I2C * controller of the SoC. Support for the connector-related operations on the * running platform is reported through the &drm_bridge.ops flags. Bridge * drivers shall detect which operations they can support on the platform * (usually this information is provided by ACPI or DT), and set the * &drm_bridge.ops flags for all supported operations. A flag shall only be * set if the corresponding &drm_bridge_funcs operation is implemented, but * an implemented operation doesn't necessarily imply that the corresponding * flag will be set. Display drivers shall use the &drm_bridge.ops flags to * decide which bridge to delegate a connector operation to. This mechanism * allows providing a single static const &drm_bridge_funcs instance in * bridge drivers, improving security by storing function pointers in * read-only memory. * * In order to ease transition, bridge drivers may support both the old and * new models by making connector creation optional and implementing the * connected-related bridge operations. Connector creation is then controlled * by the flags argument to the drm_bridge_attach() function. Display drivers * that support the new model and create connectors themselves shall set the * %DRM_BRIDGE_ATTACH_NO_CONNECTOR flag, and bridge drivers shall then skip * connector creation. For intermediate bridges in the chain, the flag shall * be passed to the drm_bridge_attach() call for the downstream bridge. * Bridge drivers that implement the new model only shall return an error * from their &drm_bridge_funcs.attach handler when the * %DRM_BRIDGE_ATTACH_NO_CONNECTOR flag is not set. New display drivers * should use the new model, and convert the bridge drivers they use if * needed, in order to gradually transition to the new model. */ /** * drm_bridge_chain_mode_fixup - fixup proposed mode for all bridges in the * encoder chain * @bridge: bridge control structure * @mode: desired mode to be set for the bridge * @adjusted_mode: updated mode that works for this bridge * * Calls &drm_bridge_funcs.mode_fixup for all the bridges in the * encoder chain, starting from the first bridge to the last. * * Note: the bridge passed should be the one closest to the encoder * * RETURNS: * true on success, false on failure */ bool drm_bridge_chain_mode_fixup(struct drm_bridge *bridge, const struct drm_display_mode *mode, struct drm_display_mode *adjusted_mode) { struct drm_encoder *encoder; if (!bridge) return true; encoder = bridge->encoder; list_for_each_entry_from(bridge, &encoder->bridge_chain, chain_node) { if (!bridge->funcs->mode_fixup) continue; if (!bridge->funcs->mode_fixup(bridge, mode, adjusted_mode)) return false; } return true; } EXPORT_SYMBOL(drm_bridge_chain_mode_fixup); /** * drm_bridge_chain_mode_valid - validate the mode against all bridges in the * encoder chain. * @bridge: bridge control structure * @info: display info against which the mode shall be validated * @mode: desired mode to be validated * * Calls &drm_bridge_funcs.mode_valid for all the bridges in the encoder * chain, starting from the first bridge to the last. If at least one bridge * does not accept the mode the function returns the error code. * * Note: the bridge passed should be the one closest to the encoder. * * RETURNS: * MODE_OK on success, drm_mode_status Enum error code on failure */ enum drm_mode_status drm_bridge_chain_mode_valid(struct drm_bridge *bridge, const struct drm_display_info *info, const struct drm_display_mode *mode) { struct drm_encoder *encoder; if (!bridge) return MODE_OK; encoder = bridge->encoder; list_for_each_entry_from(bridge, &encoder->bridge_chain, chain_node) { enum drm_mode_status ret; if (!bridge->funcs->mode_valid) continue; ret = bridge->funcs->mode_valid(bridge, info, mode); if (ret != MODE_OK) return ret; } return MODE_OK; } EXPORT_SYMBOL(drm_bridge_chain_mode_valid); /** * drm_bridge_chain_mode_set - set proposed mode for all bridges in the * encoder chain * @bridge: bridge control structure * @mode: desired mode to be set for the encoder chain * @adjusted_mode: updated mode that works for this encoder chain * * Calls &drm_bridge_funcs.mode_set op for all the bridges in the * encoder chain, starting from the first bridge to the last. * * Note: the bridge passed should be the one closest to the encoder */ void drm_bridge_chain_mode_set(struct drm_bridge *bridge, const struct drm_display_mode *mode, const struct drm_display_mode *adjusted_mode) { struct drm_encoder *encoder; if (!bridge) return; encoder = bridge->encoder; list_for_each_entry_from(bridge, &encoder->bridge_chain, chain_node) { if (bridge->funcs->mode_set) bridge->funcs->mode_set(bridge, mode, adjusted_mode); } } EXPORT_SYMBOL(drm_bridge_chain_mode_set); /** * drm_atomic_bridge_chain_disable - disables all bridges in the encoder chain * @bridge: bridge control structure * @old_state: old atomic state * * Calls &drm_bridge_funcs.atomic_disable (falls back on * &drm_bridge_funcs.disable) op for all the bridges in the encoder chain, * starting from the last bridge to the first. These are called before calling * &drm_encoder_helper_funcs.atomic_disable * * Note: the bridge passed should be the one closest to the encoder */ void drm_atomic_bridge_chain_disable(struct drm_bridge *bridge, struct drm_atomic_state *old_state) { struct drm_encoder *encoder; struct drm_bridge *iter; if (!bridge) return; encoder = bridge->encoder; list_for_each_entry_reverse(iter, &encoder->bridge_chain, chain_node) { if (iter->funcs->atomic_disable) { struct drm_bridge_state *old_bridge_state; old_bridge_state = drm_atomic_get_old_bridge_state(old_state, iter); if (WARN_ON(!old_bridge_state)) return; iter->funcs->atomic_disable(iter, old_bridge_state); } else if (iter->funcs->disable) { iter->funcs->disable(iter); } if (iter == bridge) break; } } EXPORT_SYMBOL(drm_atomic_bridge_chain_disable); static void drm_atomic_bridge_call_post_disable(struct drm_bridge *bridge, struct drm_atomic_state *old_state) { if (old_state && bridge->funcs->atomic_post_disable) { struct drm_bridge_state *old_bridge_state; old_bridge_state = drm_atomic_get_old_bridge_state(old_state, bridge); if (WARN_ON(!old_bridge_state)) return; bridge->funcs->atomic_post_disable(bridge, old_bridge_state); } else if (bridge->funcs->post_disable) { bridge->funcs->post_disable(bridge); } } /** * drm_atomic_bridge_chain_post_disable - cleans up after disabling all bridges * in the encoder chain * @bridge: bridge control structure * @old_state: old atomic state * * Calls &drm_bridge_funcs.atomic_post_disable (falls back on * &drm_bridge_funcs.post_disable) op for all the bridges in the encoder chain, * starting from the first bridge to the last. These are called after completing * &drm_encoder_helper_funcs.atomic_disable * * If a bridge sets @pre_enable_prev_first, then the @post_disable for that * bridge will be called before the previous one to reverse the @pre_enable * calling direction. * * Example: * Bridge A ---> Bridge B ---> Bridge C ---> Bridge D ---> Bridge E * * With pre_enable_prev_first flag enable in Bridge B, D, E then the resulting * @post_disable order would be, * Bridge B, Bridge A, Bridge E, Bridge D, Bridge C. * * Note: the bridge passed should be the one closest to the encoder */ void drm_atomic_bridge_chain_post_disable(struct drm_bridge *bridge, struct drm_atomic_state *old_state) { struct drm_encoder *encoder; struct drm_bridge *next, *limit; if (!bridge) return; encoder = bridge->encoder; list_for_each_entry_from(bridge, &encoder->bridge_chain, chain_node) { limit = NULL; if (!list_is_last(&bridge->chain_node, &encoder->bridge_chain)) { next = list_next_entry(bridge, chain_node); if (next->pre_enable_prev_first) { /* next bridge had requested that prev * was enabled first, so disabled last */ limit = next; /* Find the next bridge that has NOT requested * prev to be enabled first / disabled last */ list_for_each_entry_from(next, &encoder->bridge_chain, chain_node) { if (!next->pre_enable_prev_first) { next = list_prev_entry(next, chain_node); limit = next; break; } if (list_is_last(&next->chain_node, &encoder->bridge_chain)) { limit = next; break; } } /* Call these bridges in reverse order */ list_for_each_entry_from_reverse(next, &encoder->bridge_chain, chain_node) { if (next == bridge) break; drm_atomic_bridge_call_post_disable(next, old_state); } } } drm_atomic_bridge_call_post_disable(bridge, old_state); if (limit) /* Jump all bridges that we have already post_disabled */ bridge = limit; } } EXPORT_SYMBOL(drm_atomic_bridge_chain_post_disable); static void drm_atomic_bridge_call_pre_enable(struct drm_bridge *bridge, struct drm_atomic_state *old_state) { if (old_state && bridge->funcs->atomic_pre_enable) { struct drm_bridge_state *old_bridge_state; old_bridge_state = drm_atomic_get_old_bridge_state(old_state, bridge); if (WARN_ON(!old_bridge_state)) return; bridge->funcs->atomic_pre_enable(bridge, old_bridge_state); } else if (bridge->funcs->pre_enable) { bridge->funcs->pre_enable(bridge); } } /** * drm_atomic_bridge_chain_pre_enable - prepares for enabling all bridges in * the encoder chain * @bridge: bridge control structure * @old_state: old atomic state * * Calls &drm_bridge_funcs.atomic_pre_enable (falls back on * &drm_bridge_funcs.pre_enable) op for all the bridges in the encoder chain, * starting from the last bridge to the first. These are called before calling * &drm_encoder_helper_funcs.atomic_enable * * If a bridge sets @pre_enable_prev_first, then the pre_enable for the * prev bridge will be called before pre_enable of this bridge. * * Example: * Bridge A ---> Bridge B ---> Bridge C ---> Bridge D ---> Bridge E * * With pre_enable_prev_first flag enable in Bridge B, D, E then the resulting * @pre_enable order would be, * Bridge C, Bridge D, Bridge E, Bridge A, Bridge B. * * Note: the bridge passed should be the one closest to the encoder */ void drm_atomic_bridge_chain_pre_enable(struct drm_bridge *bridge, struct drm_atomic_state *old_state) { struct drm_encoder *encoder; struct drm_bridge *iter, *next, *limit; if (!bridge) return; encoder = bridge->encoder; list_for_each_entry_reverse(iter, &encoder->bridge_chain, chain_node) { if (iter->pre_enable_prev_first) { next = iter; limit = bridge; list_for_each_entry_from_reverse(next, &encoder->bridge_chain, chain_node) { if (next == bridge) break; if (!next->pre_enable_prev_first) { /* Found first bridge that does NOT * request prev to be enabled first */ limit = next; break; } } list_for_each_entry_from(next, &encoder->bridge_chain, chain_node) { /* Call requested prev bridge pre_enable * in order. */ if (next == iter) /* At the first bridge to request prev * bridges called first. */ break; drm_atomic_bridge_call_pre_enable(next, old_state); } } drm_atomic_bridge_call_pre_enable(iter, old_state); if (iter->pre_enable_prev_first) /* Jump all bridges that we have already pre_enabled */ iter = limit; if (iter == bridge) break; } } EXPORT_SYMBOL(drm_atomic_bridge_chain_pre_enable); /** * drm_atomic_bridge_chain_enable - enables all bridges in the encoder chain * @bridge: bridge control structure * @old_state: old atomic state * * Calls &drm_bridge_funcs.atomic_enable (falls back on * &drm_bridge_funcs.enable) op for all the bridges in the encoder chain, * starting from the first bridge to the last. These are called after completing * &drm_encoder_helper_funcs.atomic_enable * * Note: the bridge passed should be the one closest to the encoder */ void drm_atomic_bridge_chain_enable(struct drm_bridge *bridge, struct drm_atomic_state *old_state) { struct drm_encoder *encoder; if (!bridge) return; encoder = bridge->encoder; list_for_each_entry_from(bridge, &encoder->bridge_chain, chain_node) { if (bridge->funcs->atomic_enable) { struct drm_bridge_state *old_bridge_state; old_bridge_state = drm_atomic_get_old_bridge_state(old_state, bridge); if (WARN_ON(!old_bridge_state)) return; bridge->funcs->atomic_enable(bridge, old_bridge_state); } else if (bridge->funcs->enable) { bridge->funcs->enable(bridge); } } } EXPORT_SYMBOL(drm_atomic_bridge_chain_enable); static int drm_atomic_bridge_check(struct drm_bridge *bridge, struct drm_crtc_state *crtc_state, struct drm_connector_state *conn_state) { if (bridge->funcs->atomic_check) { struct drm_bridge_state *bridge_state; int ret; bridge_state = drm_atomic_get_new_bridge_state(crtc_state->state, bridge); if (WARN_ON(!bridge_state)) return -EINVAL; ret = bridge->funcs->atomic_check(bridge, bridge_state, crtc_state, conn_state); if (ret) return ret; } else if (bridge->funcs->mode_fixup) { if (!bridge->funcs->mode_fixup(bridge, &crtc_state->mode, &crtc_state->adjusted_mode)) return -EINVAL; } return 0; } static int select_bus_fmt_recursive(struct drm_bridge *first_bridge, struct drm_bridge *cur_bridge, struct drm_crtc_state *crtc_state, struct drm_connector_state *conn_state, u32 out_bus_fmt) { unsigned int i, num_in_bus_fmts = 0; struct drm_bridge_state *cur_state; struct drm_bridge *prev_bridge; u32 *in_bus_fmts; int ret; prev_bridge = drm_bridge_get_prev_bridge(cur_bridge); cur_state = drm_atomic_get_new_bridge_state(crtc_state->state, cur_bridge); /* * If bus format negotiation is not supported by this bridge, let's * pass MEDIA_BUS_FMT_FIXED to the previous bridge in the chain and * hope that it can handle this situation gracefully (by providing * appropriate default values). */ if (!cur_bridge->funcs->atomic_get_input_bus_fmts) { if (cur_bridge != first_bridge) { ret = select_bus_fmt_recursive(first_bridge, prev_bridge, crtc_state, conn_state, MEDIA_BUS_FMT_FIXED); if (ret) return ret; } /* * Driver does not implement the atomic state hooks, but that's * fine, as long as it does not access the bridge state. */ if (cur_state) { cur_state->input_bus_cfg.format = MEDIA_BUS_FMT_FIXED; cur_state->output_bus_cfg.format = out_bus_fmt; } return 0; } /* * If the driver implements ->atomic_get_input_bus_fmts() it * should also implement the atomic state hooks. */ if (WARN_ON(!cur_state)) return -EINVAL; in_bus_fmts = cur_bridge->funcs->atomic_get_input_bus_fmts(cur_bridge, cur_state, crtc_state, conn_state, out_bus_fmt, &num_in_bus_fmts); if (!num_in_bus_fmts) return -ENOTSUPP; else if (!in_bus_fmts) return -ENOMEM; if (first_bridge == cur_bridge) { cur_state->input_bus_cfg.format = in_bus_fmts[0]; cur_state->output_bus_cfg.format = out_bus_fmt; kfree(in_bus_fmts); return 0; } for (i = 0; i < num_in_bus_fmts; i++) { ret = select_bus_fmt_recursive(first_bridge, prev_bridge, crtc_state, conn_state, in_bus_fmts[i]); if (ret != -ENOTSUPP) break; } if (!ret) { cur_state->input_bus_cfg.format = in_bus_fmts[i]; cur_state->output_bus_cfg.format = out_bus_fmt; } kfree(in_bus_fmts); return ret; } /* * This function is called by &drm_atomic_bridge_chain_check() just before * calling &drm_bridge_funcs.atomic_check() on all elements of the chain. * It performs bus format negotiation between bridge elements. The negotiation * happens in reverse order, starting from the last element in the chain up to * @bridge. * * Negotiation starts by retrieving supported output bus formats on the last * bridge element and testing them one by one. The test is recursive, meaning * that for each tested output format, the whole chain will be walked backward, * and each element will have to choose an input bus format that can be * transcoded to the requested output format. When a bridge element does not * support transcoding into a specific output format -ENOTSUPP is returned and * the next bridge element will have to try a different format. If none of the * combinations worked, -ENOTSUPP is returned and the atomic modeset will fail. * * This implementation is relying on * &drm_bridge_funcs.atomic_get_output_bus_fmts() and * &drm_bridge_funcs.atomic_get_input_bus_fmts() to gather supported * input/output formats. * * When &drm_bridge_funcs.atomic_get_output_bus_fmts() is not implemented by * the last element of the chain, &drm_atomic_bridge_chain_select_bus_fmts() * tries a single format: &drm_connector.display_info.bus_formats[0] if * available, MEDIA_BUS_FMT_FIXED otherwise. * * When &drm_bridge_funcs.atomic_get_input_bus_fmts() is not implemented, * &drm_atomic_bridge_chain_select_bus_fmts() skips the negotiation on the * bridge element that lacks this hook and asks the previous element in the * chain to try MEDIA_BUS_FMT_FIXED. It's up to bridge drivers to decide what * to do in that case (fail if they want to enforce bus format negotiation, or * provide a reasonable default if they need to support pipelines where not * all elements support bus format negotiation). */ static int drm_atomic_bridge_chain_select_bus_fmts(struct drm_bridge *bridge, struct drm_crtc_state *crtc_state, struct drm_connector_state *conn_state) { struct drm_connector *conn = conn_state->connector; struct drm_encoder *encoder = bridge->encoder; struct drm_bridge_state *last_bridge_state; unsigned int i, num_out_bus_fmts = 0; struct drm_bridge *last_bridge; u32 *out_bus_fmts; int ret = 0; last_bridge = list_last_entry(&encoder->bridge_chain, struct drm_bridge, chain_node); last_bridge_state = drm_atomic_get_new_bridge_state(crtc_state->state, last_bridge); if (last_bridge->funcs->atomic_get_output_bus_fmts) { const struct drm_bridge_funcs *funcs = last_bridge->funcs; /* * If the driver implements ->atomic_get_output_bus_fmts() it * should also implement the atomic state hooks. */ if (WARN_ON(!last_bridge_state)) return -EINVAL; out_bus_fmts = funcs->atomic_get_output_bus_fmts(last_bridge, last_bridge_state, crtc_state, conn_state, &num_out_bus_fmts); if (!num_out_bus_fmts) return -ENOTSUPP; else if (!out_bus_fmts) return -ENOMEM; } else { num_out_bus_fmts = 1; out_bus_fmts = kmalloc(sizeof(*out_bus_fmts), GFP_KERNEL); if (!out_bus_fmts) return -ENOMEM; if (conn->display_info.num_bus_formats && conn->display_info.bus_formats) out_bus_fmts[0] = conn->display_info.bus_formats[0]; else out_bus_fmts[0] = MEDIA_BUS_FMT_FIXED; } for (i = 0; i < num_out_bus_fmts; i++) { ret = select_bus_fmt_recursive(bridge, last_bridge, crtc_state, conn_state, out_bus_fmts[i]); if (ret != -ENOTSUPP) break; } kfree(out_bus_fmts); return ret; } static void drm_atomic_bridge_propagate_bus_flags(struct drm_bridge *bridge, struct drm_connector *conn, struct drm_atomic_state *state) { struct drm_bridge_state *bridge_state, *next_bridge_state; struct drm_bridge *next_bridge; u32 output_flags = 0; bridge_state = drm_atomic_get_new_bridge_state(state, bridge); /* No bridge state attached to this bridge => nothing to propagate. */ if (!bridge_state) return; next_bridge = drm_bridge_get_next_bridge(bridge); /* * Let's try to apply the most common case here, that is, propagate * display_info flags for the last bridge, and propagate the input * flags of the next bridge element to the output end of the current * bridge when the bridge is not the last one. * There are exceptions to this rule, like when signal inversion is * happening at the board level, but that's something drivers can deal * with from their &drm_bridge_funcs.atomic_check() implementation by * simply overriding the flags value we've set here. */ if (!next_bridge) { output_flags = conn->display_info.bus_flags; } else { next_bridge_state = drm_atomic_get_new_bridge_state(state, next_bridge); /* * No bridge state attached to the next bridge, just leave the * flags to 0. */ if (next_bridge_state) output_flags = next_bridge_state->input_bus_cfg.flags; } bridge_state->output_bus_cfg.flags = output_flags; /* * Propagate the output flags to the input end of the bridge. Again, it's * not necessarily what all bridges want, but that's what most of them * do, and by doing that by default we avoid forcing drivers to * duplicate the "dummy propagation" logic. */ bridge_state->input_bus_cfg.flags = output_flags; } /** * drm_atomic_bridge_chain_check() - Do an atomic check on the bridge chain * @bridge: bridge control structure * @crtc_state: new CRTC state * @conn_state: new connector state * * First trigger a bus format negotiation before calling * &drm_bridge_funcs.atomic_check() (falls back on * &drm_bridge_funcs.mode_fixup()) op for all the bridges in the encoder chain, * starting from the last bridge to the first. These are called before calling * &drm_encoder_helper_funcs.atomic_check() * * RETURNS: * 0 on success, a negative error code on failure */ int drm_atomic_bridge_chain_check(struct drm_bridge *bridge, struct drm_crtc_state *crtc_state, struct drm_connector_state *conn_state) { struct drm_connector *conn = conn_state->connector; struct drm_encoder *encoder; struct drm_bridge *iter; int ret; if (!bridge) return 0; ret = drm_atomic_bridge_chain_select_bus_fmts(bridge, crtc_state, conn_state); if (ret) return ret; encoder = bridge->encoder; list_for_each_entry_reverse(iter, &encoder->bridge_chain, chain_node) { int ret; /* * Bus flags are propagated by default. If a bridge needs to * tweak the input bus flags for any reason, it should happen * in its &drm_bridge_funcs.atomic_check() implementation such * that preceding bridges in the chain can propagate the new * bus flags. */ drm_atomic_bridge_propagate_bus_flags(iter, conn, crtc_state->state); ret = drm_atomic_bridge_check(iter, crtc_state, conn_state); if (ret) return ret; if (iter == bridge) break; } return 0; } EXPORT_SYMBOL(drm_atomic_bridge_chain_check); /** * drm_bridge_detect - check if anything is attached to the bridge output * @bridge: bridge control structure * * If the bridge supports output detection, as reported by the * DRM_BRIDGE_OP_DETECT bridge ops flag, call &drm_bridge_funcs.detect for the * bridge and return the connection status. Otherwise return * connector_status_unknown. * * RETURNS: * The detection status on success, or connector_status_unknown if the bridge * doesn't support output detection. */ enum drm_connector_status drm_bridge_detect(struct drm_bridge *bridge) { if (!(bridge->ops & DRM_BRIDGE_OP_DETECT)) return connector_status_unknown; return bridge->funcs->detect(bridge); } EXPORT_SYMBOL_GPL(drm_bridge_detect); /** * drm_bridge_get_modes - fill all modes currently valid for the sink into the * @connector * @bridge: bridge control structure * @connector: the connector to fill with modes * * If the bridge supports output modes retrieval, as reported by the * DRM_BRIDGE_OP_MODES bridge ops flag, call &drm_bridge_funcs.get_modes to * fill the connector with all valid modes and return the number of modes * added. Otherwise return 0. * * RETURNS: * The number of modes added to the connector. */ int drm_bridge_get_modes(struct drm_bridge *bridge, struct drm_connector *connector) { if (!(bridge->ops & DRM_BRIDGE_OP_MODES)) return 0; return bridge->funcs->get_modes(bridge, connector); } EXPORT_SYMBOL_GPL(drm_bridge_get_modes); /** * drm_bridge_edid_read - read the EDID data of the connected display * @bridge: bridge control structure * @connector: the connector to read EDID for * * If the bridge supports output EDID retrieval, as reported by the * DRM_BRIDGE_OP_EDID bridge ops flag, call &drm_bridge_funcs.edid_read to get * the EDID and return it. Otherwise return NULL. * * RETURNS: * The retrieved EDID on success, or NULL otherwise. */ const struct drm_edid *drm_bridge_edid_read(struct drm_bridge *bridge, struct drm_connector *connector) { if (!(bridge->ops & DRM_BRIDGE_OP_EDID)) return NULL; return bridge->funcs->edid_read(bridge, connector); } EXPORT_SYMBOL_GPL(drm_bridge_edid_read); /** * drm_bridge_hpd_enable - enable hot plug detection for the bridge * @bridge: bridge control structure * @cb: hot-plug detection callback * @data: data to be passed to the hot-plug detection callback * * Call &drm_bridge_funcs.hpd_enable if implemented and register the given @cb * and @data as hot plug notification callback. From now on the @cb will be * called with @data when an output status change is detected by the bridge, * until hot plug notification gets disabled with drm_bridge_hpd_disable(). * * Hot plug detection is supported only if the DRM_BRIDGE_OP_HPD flag is set in * bridge->ops. This function shall not be called when the flag is not set. * * Only one hot plug detection callback can be registered at a time, it is an * error to call this function when hot plug detection is already enabled for * the bridge. */ void drm_bridge_hpd_enable(struct drm_bridge *bridge, void (*cb)(void *data, enum drm_connector_status status), void *data) { if (!(bridge->ops & DRM_BRIDGE_OP_HPD)) return; mutex_lock(&bridge->hpd_mutex); if (WARN(bridge->hpd_cb, "Hot plug detection already enabled\n")) goto unlock; bridge->hpd_cb = cb; bridge->hpd_data = data; if (bridge->funcs->hpd_enable) bridge->funcs->hpd_enable(bridge); unlock: mutex_unlock(&bridge->hpd_mutex); } EXPORT_SYMBOL_GPL(drm_bridge_hpd_enable); /** * drm_bridge_hpd_disable - disable hot plug detection for the bridge * @bridge: bridge control structure * * Call &drm_bridge_funcs.hpd_disable if implemented and unregister the hot * plug detection callback previously registered with drm_bridge_hpd_enable(). * Once this function returns the callback will not be called by the bridge * when an output status change occurs. * * Hot plug detection is supported only if the DRM_BRIDGE_OP_HPD flag is set in * bridge->ops. This function shall not be called when the flag is not set. */ void drm_bridge_hpd_disable(struct drm_bridge *bridge) { if (!(bridge->ops & DRM_BRIDGE_OP_HPD)) return; mutex_lock(&bridge->hpd_mutex); if (bridge->funcs->hpd_disable) bridge->funcs->hpd_disable(bridge); bridge->hpd_cb = NULL; bridge->hpd_data = NULL; mutex_unlock(&bridge->hpd_mutex); } EXPORT_SYMBOL_GPL(drm_bridge_hpd_disable); /** * drm_bridge_hpd_notify - notify hot plug detection events * @bridge: bridge control structure * @status: output connection status * * Bridge drivers shall call this function to report hot plug events when they * detect a change in the output status, when hot plug detection has been * enabled by drm_bridge_hpd_enable(). * * This function shall be called in a context that can sleep. */ void drm_bridge_hpd_notify(struct drm_bridge *bridge, enum drm_connector_status status) { mutex_lock(&bridge->hpd_mutex); if (bridge->hpd_cb) bridge->hpd_cb(bridge->hpd_data, status); mutex_unlock(&bridge->hpd_mutex); } EXPORT_SYMBOL_GPL(drm_bridge_hpd_notify); #ifdef CONFIG_OF /** * of_drm_find_bridge - find the bridge corresponding to the device node in * the global bridge list * * @np: device node * * RETURNS: * drm_bridge control struct on success, NULL on failure */ struct drm_bridge *of_drm_find_bridge(struct device_node *np) { struct drm_bridge *bridge; mutex_lock(&bridge_lock); list_for_each_entry(bridge, &bridge_list, list) { if (bridge->of_node == np) { mutex_unlock(&bridge_lock); return bridge; } } mutex_unlock(&bridge_lock); return NULL; } EXPORT_SYMBOL(of_drm_find_bridge); #endif MODULE_AUTHOR("Ajay Kumar <ajaykumar.rs@samsung.com>"); MODULE_DESCRIPTION("DRM bridge infrastructure"); MODULE_LICENSE("GPL and additional rights");
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 /* * Generic 1-bit or 8-bit source to 1-32 bit destination expansion * for frame buffer located in system RAM with packed pixels of any depth. * * Based almost entirely on cfbimgblt.c * * Copyright (C) April 2007 Antonino Daplas <adaplas@pol.net> * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of this archive for * more details. */ #include <linux/module.h> #include <linux/string.h> #include <linux/fb.h> #include <asm/types.h> #define DEBUG #ifdef DEBUG #define DPRINTK(fmt, args...) printk(KERN_DEBUG "%s: " fmt,__func__,## args) #else #define DPRINTK(fmt, args...) #endif static const u32 cfb_tab8_be[] = { 0x00000000,0x000000ff,0x0000ff00,0x0000ffff, 0x00ff0000,0x00ff00ff,0x00ffff00,0x00ffffff, 0xff000000,0xff0000ff,0xff00ff00,0xff00ffff, 0xffff0000,0xffff00ff,0xffffff00,0xffffffff }; static const u32 cfb_tab8_le[] = { 0x00000000,0xff000000,0x00ff0000,0xffff0000, 0x0000ff00,0xff00ff00,0x00ffff00,0xffffff00, 0x000000ff,0xff0000ff,0x00ff00ff,0xffff00ff, 0x0000ffff,0xff00ffff,0x00ffffff,0xffffffff }; static const u32 cfb_tab16_be[] = { 0x00000000, 0x0000ffff, 0xffff0000, 0xffffffff }; static const u32 cfb_tab16_le[] = { 0x00000000, 0xffff0000, 0x0000ffff, 0xffffffff }; static const u32 cfb_tab32[] = { 0x00000000, 0xffffffff }; static void color_imageblit(const struct fb_image *image, struct fb_info *p, void *dst1, u32 start_index, u32 pitch_index) { /* Draw the penguin */ u32 *dst, *dst2; u32 color = 0, val, shift; int i, n, bpp = p->var.bits_per_pixel; u32 null_bits = 32 - bpp; u32 *palette = (u32 *) p->pseudo_palette; const u8 *src = image->data; dst2 = dst1; for (i = image->height; i--; ) { n = image->width; dst = dst1; shift = 0; val = 0; if (start_index) { u32 start_mask = ~(FB_SHIFT_HIGH(p, ~(u32)0, start_index)); val = *dst & start_mask; shift = start_index; } while (n--) { if (p->fix.visual == FB_VISUAL_TRUECOLOR || p->fix.visual == FB_VISUAL_DIRECTCOLOR ) color = palette[*src]; else color = *src; color <<= FB_LEFT_POS(p, bpp); val |= FB_SHIFT_HIGH(p, color, shift); if (shift >= null_bits) { *dst++ = val; val = (shift == null_bits) ? 0 : FB_SHIFT_LOW(p, color, 32 - shift); } shift += bpp; shift &= (32 - 1); src++; } if (shift) { u32 end_mask = FB_SHIFT_HIGH(p, ~(u32)0, shift); *dst &= end_mask; *dst |= val; } dst1 += p->fix.line_length; if (pitch_index) { dst2 += p->fix.line_length; dst1 = (u8 *)((long)dst2 & ~(sizeof(u32) - 1)); start_index += pitch_index; start_index &= 32 - 1; } } } static void slow_imageblit(const struct fb_image *image, struct fb_info *p, void *dst1, u32 fgcolor, u32 bgcolor, u32 start_index, u32 pitch_index) { u32 shift, color = 0, bpp = p->var.bits_per_pixel; u32 *dst, *dst2; u32 val, pitch = p->fix.line_length; u32 null_bits = 32 - bpp; u32 spitch = (image->width+7)/8; const u8 *src = image->data, *s; u32 i, j, l; dst2 = dst1; fgcolor <<= FB_LEFT_POS(p, bpp); bgcolor <<= FB_LEFT_POS(p, bpp); for (i = image->height; i--; ) { shift = val = 0; l = 8; j = image->width; dst = dst1; s = src; /* write leading bits */ if (start_index) { u32 start_mask = ~(FB_SHIFT_HIGH(p, ~(u32)0, start_index)); val = *dst & start_mask; shift = start_index; } while (j--) { l--; color = (*s & (1 << l)) ? fgcolor : bgcolor; val |= FB_SHIFT_HIGH(p, color, shift); /* Did the bitshift spill bits to the next long? */ if (shift >= null_bits) { *dst++ = val; val = (shift == null_bits) ? 0 : FB_SHIFT_LOW(p, color, 32 - shift); } shift += bpp; shift &= (32 - 1); if (!l) { l = 8; s++; } } /* write trailing bits */ if (shift) { u32 end_mask = FB_SHIFT_HIGH(p, ~(u32)0, shift); *dst &= end_mask; *dst |= val; } dst1 += pitch; src += spitch; if (pitch_index) { dst2 += pitch; dst1 = (u8 *)((long)dst2 & ~(sizeof(u32) - 1)); start_index += pitch_index; start_index &= 32 - 1; } } } /* * fast_imageblit - optimized monochrome color expansion * * Only if: bits_per_pixel == 8, 16, or 32 * image->width is divisible by pixel/dword (ppw); * fix->line_legth is divisible by 4; * beginning and end of a scanline is dword aligned */ static void fast_imageblit(const struct fb_image *image, struct fb_info *p, void *dst1, u32 fgcolor, u32 bgcolor) { u32 fgx = fgcolor, bgx = bgcolor, bpp = p->var.bits_per_pixel; u32 ppw = 32/bpp, spitch = (image->width + 7)/8; u32 bit_mask, eorx, shift; const u8 *s = image->data, *src; u32 *dst; const u32 *tab; size_t tablen; u32 colortab[16]; int i, j, k; switch (bpp) { case 8: tab = fb_be_math(p) ? cfb_tab8_be : cfb_tab8_le; tablen = 16; break; case 16: tab = fb_be_math(p) ? cfb_tab16_be : cfb_tab16_le; tablen = 4; break; case 32: tab = cfb_tab32; tablen = 2; break; default: return; } for (i = ppw-1; i--; ) { fgx <<= bpp; bgx <<= bpp; fgx |= fgcolor; bgx |= bgcolor; } bit_mask = (1 << ppw) - 1; eorx = fgx ^ bgx; k = image->width/ppw; for (i = 0; i < tablen; ++i) colortab[i] = (tab[i] & eorx) ^ bgx; for (i = image->height; i--; ) { dst = dst1; shift = 8; src = s; /* * Manually unroll the per-line copying loop for better * performance. This works until we processed the last * completely filled source byte (inclusive). */ switch (ppw) { case 4: /* 8 bpp */ for (j = k; j >= 2; j -= 2, ++src) { *dst++ = colortab[(*src >> 4) & bit_mask]; *dst++ = colortab[(*src >> 0) & bit_mask]; } break; case 2: /* 16 bpp */ for (j = k; j >= 4; j -= 4, ++src) { *dst++ = colortab[(*src >> 6) & bit_mask]; *dst++ = colortab[(*src >> 4) & bit_mask]; *dst++ = colortab[(*src >> 2) & bit_mask]; *dst++ = colortab[(*src >> 0) & bit_mask]; } break; case 1: /* 32 bpp */ for (j = k; j >= 8; j -= 8, ++src) { *dst++ = colortab[(*src >> 7) & bit_mask]; *dst++ = colortab[(*src >> 6) & bit_mask]; *dst++ = colortab[(*src >> 5) & bit_mask]; *dst++ = colortab[(*src >> 4) & bit_mask]; *dst++ = colortab[(*src >> 3) & bit_mask]; *dst++ = colortab[(*src >> 2) & bit_mask]; *dst++ = colortab[(*src >> 1) & bit_mask]; *dst++ = colortab[(*src >> 0) & bit_mask]; } break; } /* * For image widths that are not a multiple of 8, there * are trailing pixels left on the current line. Print * them as well. */ for (; j--; ) { shift -= ppw; *dst++ = colortab[(*src >> shift) & bit_mask]; if (!shift) { shift = 8; ++src; } } dst1 += p->fix.line_length; s += spitch; } } void sys_imageblit(struct fb_info *p, const struct fb_image *image) { u32 fgcolor, bgcolor, start_index, bitstart, pitch_index = 0; u32 bpl = sizeof(u32), bpp = p->var.bits_per_pixel; u32 width = image->width; u32 dx = image->dx, dy = image->dy; void *dst1; if (p->state != FBINFO_STATE_RUNNING) return; if (!(p->flags & FBINFO_VIRTFB)) fb_warn_once(p, "Framebuffer is not in virtual address space."); bitstart = (dy * p->fix.line_length * 8) + (dx * bpp); start_index = bitstart & (32 - 1); pitch_index = (p->fix.line_length & (bpl - 1)) * 8; bitstart /= 8; bitstart &= ~(bpl - 1); dst1 = (void __force *)p->screen_base + bitstart; if (p->fbops->fb_sync) p->fbops->fb_sync(p); if (image->depth == 1) { if (p->fix.visual == FB_VISUAL_TRUECOLOR || p->fix.visual == FB_VISUAL_DIRECTCOLOR) { fgcolor = ((u32*)(p->pseudo_palette))[image->fg_color]; bgcolor = ((u32*)(p->pseudo_palette))[image->bg_color]; } else { fgcolor = image->fg_color; bgcolor = image->bg_color; } if (32 % bpp == 0 && !start_index && !pitch_index && ((width & (32/bpp-1)) == 0) && bpp >= 8 && bpp <= 32) fast_imageblit(image, p, dst1, fgcolor, bgcolor); else slow_imageblit(image, p, dst1, fgcolor, bgcolor, start_index, pitch_index); } else color_imageblit(image, p, dst1, start_index, pitch_index); } EXPORT_SYMBOL(sys_imageblit); MODULE_AUTHOR("Antonino Daplas <adaplas@pol.net>"); MODULE_DESCRIPTION("1-bit/8-bit to 1-32 bit color expansion (sys-to-sys)"); MODULE_LICENSE("GPL");
9 2 18 42 51 5 34 5 53 53 3 5 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Cryptographic API. * * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> * Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au> */ #ifndef _CRYPTO_INTERNAL_H #define _CRYPTO_INTERNAL_H #include <crypto/algapi.h> #include <linux/completion.h> #include <linux/err.h> #include <linux/jump_label.h> #include <linux/list.h> #include <linux/module.h> #include <linux/notifier.h> #include <linux/numa.h> #include <linux/refcount.h> #include <linux/rwsem.h> #include <linux/scatterlist.h> #include <linux/sched.h> #include <linux/types.h> struct akcipher_request; struct crypto_akcipher; struct crypto_instance; struct crypto_template; struct crypto_larval { struct crypto_alg alg; struct crypto_alg *adult; struct completion completion; u32 mask; bool test_started; }; struct crypto_akcipher_sync_data { struct crypto_akcipher *tfm; const void *src; void *dst; unsigned int slen; unsigned int dlen; struct akcipher_request *req; struct crypto_wait cwait; struct scatterlist sg; u8 *buf; }; enum { CRYPTOA_UNSPEC, CRYPTOA_ALG, CRYPTOA_TYPE, __CRYPTOA_MAX, }; #define CRYPTOA_MAX (__CRYPTOA_MAX - 1) /* Maximum number of (rtattr) parameters for each template. */ #define CRYPTO_MAX_ATTRS 32 extern struct list_head crypto_alg_list; extern struct rw_semaphore crypto_alg_sem; extern struct blocking_notifier_head crypto_chain; int alg_test(const char *driver, const char *alg, u32 type, u32 mask); #ifdef CONFIG_CRYPTO_MANAGER_DISABLE_TESTS static inline bool crypto_boot_test_finished(void) { return true; } static inline void set_crypto_boot_test_finished(void) { } #else DECLARE_STATIC_KEY_FALSE(__crypto_boot_test_finished); static inline bool crypto_boot_test_finished(void) { return static_branch_likely(&__crypto_boot_test_finished); } static inline void set_crypto_boot_test_finished(void) { static_branch_enable(&__crypto_boot_test_finished); } #endif /* !CONFIG_CRYPTO_MANAGER_DISABLE_TESTS */ #ifdef CONFIG_PROC_FS void __init crypto_init_proc(void); void __exit crypto_exit_proc(void); #else static inline void crypto_init_proc(void) { } static inline void crypto_exit_proc(void) { } #endif static inline unsigned int crypto_cipher_ctxsize(struct crypto_alg *alg) { return alg->cra_ctxsize; } static inline unsigned int crypto_compress_ctxsize(struct crypto_alg *alg) { return alg->cra_ctxsize; } struct crypto_alg *crypto_mod_get(struct crypto_alg *alg); struct crypto_alg *crypto_alg_mod_lookup(const char *name, u32 type, u32 mask); struct crypto_larval *crypto_larval_alloc(const char *name, u32 type, u32 mask); void crypto_larval_kill(struct crypto_alg *alg); void crypto_wait_for_test(struct crypto_larval *larval); void crypto_alg_tested(const char *name, int err); void crypto_remove_spawns(struct crypto_alg *alg, struct list_head *list, struct crypto_alg *nalg); void crypto_remove_final(struct list_head *list); void crypto_shoot_alg(struct crypto_alg *alg); struct crypto_tfm *__crypto_alloc_tfmgfp(struct crypto_alg *alg, u32 type, u32 mask, gfp_t gfp); struct crypto_tfm *__crypto_alloc_tfm(struct crypto_alg *alg, u32 type, u32 mask); void *crypto_create_tfm_node(struct crypto_alg *alg, const struct crypto_type *frontend, int node); void *crypto_clone_tfm(const struct crypto_type *frontend, struct crypto_tfm *otfm); int crypto_akcipher_sync_prep(struct crypto_akcipher_sync_data *data); int crypto_akcipher_sync_post(struct crypto_akcipher_sync_data *data, int err); int crypto_init_akcipher_ops_sig(struct crypto_tfm *tfm); static inline void *crypto_create_tfm(struct crypto_alg *alg, const struct crypto_type *frontend) { return crypto_create_tfm_node(alg, frontend, NUMA_NO_NODE); } struct crypto_alg *crypto_find_alg(const char *alg_name, const struct crypto_type *frontend, u32 type, u32 mask); void *crypto_alloc_tfm_node(const char *alg_name, const struct crypto_type *frontend, u32 type, u32 mask, int node); static inline void *crypto_alloc_tfm(const char *alg_name, const struct crypto_type *frontend, u32 type, u32 mask) { return crypto_alloc_tfm_node(alg_name, frontend, type, mask, NUMA_NO_NODE); } int crypto_probing_notify(unsigned long val, void *v); unsigned int crypto_alg_extsize(struct crypto_alg *alg); int crypto_type_has_alg(const char *name, const struct crypto_type *frontend, u32 type, u32 mask); static inline struct crypto_alg *crypto_alg_get(struct crypto_alg *alg) { refcount_inc(&alg->cra_refcnt); return alg; } static inline void crypto_alg_put(struct crypto_alg *alg) { if (refcount_dec_and_test(&alg->cra_refcnt) && alg->cra_destroy) alg->cra_destroy(alg); } static inline int crypto_tmpl_get(struct crypto_template *tmpl) { return try_module_get(tmpl->module); } static inline void crypto_tmpl_put(struct crypto_template *tmpl) { module_put(tmpl->module); } static inline int crypto_is_larval(struct crypto_alg *alg) { return alg->cra_flags & CRYPTO_ALG_LARVAL; } static inline int crypto_is_dead(struct crypto_alg *alg) { return alg->cra_flags & CRYPTO_ALG_DEAD; } static inline int crypto_is_moribund(struct crypto_alg *alg) { return alg->cra_flags & (CRYPTO_ALG_DEAD | CRYPTO_ALG_DYING); } static inline void crypto_notify(unsigned long val, void *v) { blocking_notifier_call_chain(&crypto_chain, val, v); } static inline void crypto_yield(u32 flags) { if (flags & CRYPTO_TFM_REQ_MAY_SLEEP) cond_resched(); } static inline int crypto_is_test_larval(struct crypto_larval *larval) { return larval->alg.cra_driver_name[0]; } static inline struct crypto_tfm *crypto_tfm_get(struct crypto_tfm *tfm) { return refcount_inc_not_zero(&tfm->refcnt) ? tfm : ERR_PTR(-EOVERFLOW); } #endif /* _CRYPTO_INTERNAL_H */
3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 // SPDX-License-Identifier: GPL-2.0-only /* * Linux VM pressure * * Copyright 2012 Linaro Ltd. * Anton Vorontsov <anton.vorontsov@linaro.org> * * Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro, * Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg. */ #include <linux/cgroup.h> #include <linux/fs.h> #include <linux/log2.h> #include <linux/sched.h> #include <linux/mm.h> #include <linux/vmstat.h> #include <linux/eventfd.h> #include <linux/slab.h> #include <linux/swap.h> #include <linux/printk.h> #include <linux/vmpressure.h> /* * The window size (vmpressure_win) is the number of scanned pages before * we try to analyze scanned/reclaimed ratio. So the window is used as a * rate-limit tunable for the "low" level notification, and also for * averaging the ratio for medium/critical levels. Using small window * sizes can cause lot of false positives, but too big window size will * delay the notifications. * * As the vmscan reclaimer logic works with chunks which are multiple of * SWAP_CLUSTER_MAX, it makes sense to use it for the window size as well. * * TODO: Make the window size depend on machine size, as we do for vmstat * thresholds. Currently we set it to 512 pages (2MB for 4KB pages). */ static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16; /* * These thresholds are used when we account memory pressure through * scanned/reclaimed ratio. The current values were chosen empirically. In * essence, they are percents: the higher the value, the more number * unsuccessful reclaims there were. */ static const unsigned int vmpressure_level_med = 60; static const unsigned int vmpressure_level_critical = 95; /* * When there are too little pages left to scan, vmpressure() may miss the * critical pressure as number of pages will be less than "window size". * However, in that case the vmscan priority will raise fast as the * reclaimer will try to scan LRUs more deeply. * * The vmscan logic considers these special priorities: * * prio == DEF_PRIORITY (12): reclaimer starts with that value * prio <= DEF_PRIORITY - 2 : kswapd becomes somewhat overwhelmed * prio == 0 : close to OOM, kernel scans every page in an lru * * Any value in this range is acceptable for this tunable (i.e. from 12 to * 0). Current value for the vmpressure_level_critical_prio is chosen * empirically, but the number, in essence, means that we consider * critical level when scanning depth is ~10% of the lru size (vmscan * scans 'lru_size >> prio' pages, so it is actually 12.5%, or one * eights). */ static const unsigned int vmpressure_level_critical_prio = ilog2(100 / 10); static struct vmpressure *work_to_vmpressure(struct work_struct *work) { return container_of(work, struct vmpressure, work); } static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr) { struct mem_cgroup *memcg = vmpressure_to_memcg(vmpr); memcg = parent_mem_cgroup(memcg); if (!memcg) return NULL; return memcg_to_vmpressure(memcg); } enum vmpressure_levels { VMPRESSURE_LOW = 0, VMPRESSURE_MEDIUM, VMPRESSURE_CRITICAL, VMPRESSURE_NUM_LEVELS, }; enum vmpressure_modes { VMPRESSURE_NO_PASSTHROUGH = 0, VMPRESSURE_HIERARCHY, VMPRESSURE_LOCAL, VMPRESSURE_NUM_MODES, }; static const char * const vmpressure_str_levels[] = { [VMPRESSURE_LOW] = "low", [VMPRESSURE_MEDIUM] = "medium", [VMPRESSURE_CRITICAL] = "critical", }; static const char * const vmpressure_str_modes[] = { [VMPRESSURE_NO_PASSTHROUGH] = "default", [VMPRESSURE_HIERARCHY] = "hierarchy", [VMPRESSURE_LOCAL] = "local", }; static enum vmpressure_levels vmpressure_level(unsigned long pressure) { if (pressure >= vmpressure_level_critical) return VMPRESSURE_CRITICAL; else if (pressure >= vmpressure_level_med) return VMPRESSURE_MEDIUM; return VMPRESSURE_LOW; } static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned, unsigned long reclaimed) { unsigned long scale = scanned + reclaimed; unsigned long pressure = 0; /* * reclaimed can be greater than scanned for things such as reclaimed * slab pages. shrink_node() just adds reclaimed pages without a * related increment to scanned pages. */ if (reclaimed >= scanned) goto out; /* * We calculate the ratio (in percents) of how many pages were * scanned vs. reclaimed in a given time frame (window). Note that * time is in VM reclaimer's "ticks", i.e. number of pages * scanned. This makes it possible to set desired reaction time * and serves as a ratelimit. */ pressure = scale - (reclaimed * scale / scanned); pressure = pressure * 100 / scale; out: pr_debug("%s: %3lu (s: %lu r: %lu)\n", __func__, pressure, scanned, reclaimed); return vmpressure_level(pressure); } struct vmpressure_event { struct eventfd_ctx *efd; enum vmpressure_levels level; enum vmpressure_modes mode; struct list_head node; }; static bool vmpressure_event(struct vmpressure *vmpr, const enum vmpressure_levels level, bool ancestor, bool signalled) { struct vmpressure_event *ev; bool ret = false; mutex_lock(&vmpr->events_lock); list_for_each_entry(ev, &vmpr->events, node) { if (ancestor && ev->mode == VMPRESSURE_LOCAL) continue; if (signalled && ev->mode == VMPRESSURE_NO_PASSTHROUGH) continue; if (level < ev->level) continue; eventfd_signal(ev->efd); ret = true; } mutex_unlock(&vmpr->events_lock); return ret; } static void vmpressure_work_fn(struct work_struct *work) { struct vmpressure *vmpr = work_to_vmpressure(work); unsigned long scanned; unsigned long reclaimed; enum vmpressure_levels level; bool ancestor = false; bool signalled = false; spin_lock(&vmpr->sr_lock); /* * Several contexts might be calling vmpressure(), so it is * possible that the work was rescheduled again before the old * work context cleared the counters. In that case we will run * just after the old work returns, but then scanned might be zero * here. No need for any locks here since we don't care if * vmpr->reclaimed is in sync. */ scanned = vmpr->tree_scanned; if (!scanned) { spin_unlock(&vmpr->sr_lock); return; } reclaimed = vmpr->tree_reclaimed; vmpr->tree_scanned = 0; vmpr->tree_reclaimed = 0; spin_unlock(&vmpr->sr_lock); level = vmpressure_calc_level(scanned, reclaimed); do { if (vmpressure_event(vmpr, level, ancestor, signalled)) signalled = true; ancestor = true; } while ((vmpr = vmpressure_parent(vmpr))); } /** * vmpressure() - Account memory pressure through scanned/reclaimed ratio * @gfp: reclaimer's gfp mask * @memcg: cgroup memory controller handle * @tree: legacy subtree mode * @scanned: number of pages scanned * @reclaimed: number of pages reclaimed * * This function should be called from the vmscan reclaim path to account * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw * pressure index is then further refined and averaged over time. * * If @tree is set, vmpressure is in traditional userspace reporting * mode: @memcg is considered the pressure root and userspace is * notified of the entire subtree's reclaim efficiency. * * If @tree is not set, reclaim efficiency is recorded for @memcg, and * only in-kernel users are notified. * * This function does not return any value. */ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, unsigned long scanned, unsigned long reclaimed) { struct vmpressure *vmpr; if (mem_cgroup_disabled()) return; /* * The in-kernel users only care about the reclaim efficiency * for this @memcg rather than the whole subtree, and there * isn't and won't be any in-kernel user in a legacy cgroup. */ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !tree) return; vmpr = memcg_to_vmpressure(memcg); /* * Here we only want to account pressure that userland is able to * help us with. For example, suppose that DMA zone is under * pressure; if we notify userland about that kind of pressure, * then it will be mostly a waste as it will trigger unnecessary * freeing of memory by userland (since userland is more likely to * have HIGHMEM/MOVABLE pages instead of the DMA fallback). That * is why we include only movable, highmem and FS/IO pages. * Indirect reclaim (kswapd) sets sc->gfp_mask to GFP_KERNEL, so * we account it too. */ if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS))) return; /* * If we got here with no pages scanned, then that is an indicator * that reclaimer was unable to find any shrinkable LRUs at the * current scanning depth. But it does not mean that we should * report the critical pressure, yet. If the scanning priority * (scanning depth) goes too high (deep), we will be notified * through vmpressure_prio(). But so far, keep calm. */ if (!scanned) return; if (tree) { spin_lock(&vmpr->sr_lock); scanned = vmpr->tree_scanned += scanned; vmpr->tree_reclaimed += reclaimed; spin_unlock(&vmpr->sr_lock); if (scanned < vmpressure_win) return; schedule_work(&vmpr->work); } else { enum vmpressure_levels level; /* For now, no users for root-level efficiency */ if (!memcg || mem_cgroup_is_root(memcg)) return; spin_lock(&vmpr->sr_lock); scanned = vmpr->scanned += scanned; reclaimed = vmpr->reclaimed += reclaimed; if (scanned < vmpressure_win) { spin_unlock(&vmpr->sr_lock); return; } vmpr->scanned = vmpr->reclaimed = 0; spin_unlock(&vmpr->sr_lock); level = vmpressure_calc_level(scanned, reclaimed); if (level > VMPRESSURE_LOW) { /* * Let the socket buffer allocator know that * we are having trouble reclaiming LRU pages. * * For hysteresis keep the pressure state * asserted for a second in which subsequent * pressure events can occur. */ WRITE_ONCE(memcg->socket_pressure, jiffies + HZ); } } } /** * vmpressure_prio() - Account memory pressure through reclaimer priority level * @gfp: reclaimer's gfp mask * @memcg: cgroup memory controller handle * @prio: reclaimer's priority * * This function should be called from the reclaim path every time when * the vmscan's reclaiming priority (scanning depth) changes. * * This function does not return any value. */ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) { /* * We only use prio for accounting critical level. For more info * see comment for vmpressure_level_critical_prio variable above. */ if (prio > vmpressure_level_critical_prio) return; /* * OK, the prio is below the threshold, updating vmpressure * information before shrinker dives into long shrinking of long * range vmscan. Passing scanned = vmpressure_win, reclaimed = 0 * to the vmpressure() basically means that we signal 'critical' * level. */ vmpressure(gfp, memcg, true, vmpressure_win, 0); } #define MAX_VMPRESSURE_ARGS_LEN (strlen("critical") + strlen("hierarchy") + 2) /** * vmpressure_register_event() - Bind vmpressure notifications to an eventfd * @memcg: memcg that is interested in vmpressure notifications * @eventfd: eventfd context to link notifications with * @args: event arguments (pressure level threshold, optional mode) * * This function associates eventfd context with the vmpressure * infrastructure, so that the notifications will be delivered to the * @eventfd. The @args parameter is a comma-delimited string that denotes a * pressure level threshold (one of vmpressure_str_levels, i.e. "low", "medium", * or "critical") and an optional mode (one of vmpressure_str_modes, i.e. * "hierarchy" or "local"). * * To be used as memcg event method. * * Return: 0 on success, -ENOMEM on memory failure or -EINVAL if @args could * not be parsed. */ int vmpressure_register_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd, const char *args) { struct vmpressure *vmpr = memcg_to_vmpressure(memcg); struct vmpressure_event *ev; enum vmpressure_modes mode = VMPRESSURE_NO_PASSTHROUGH; enum vmpressure_levels level; char *spec, *spec_orig; char *token; int ret = 0; spec_orig = spec = kstrndup(args, MAX_VMPRESSURE_ARGS_LEN, GFP_KERNEL); if (!spec) return -ENOMEM; /* Find required level */ token = strsep(&spec, ","); ret = match_string(vmpressure_str_levels, VMPRESSURE_NUM_LEVELS, token); if (ret < 0) goto out; level = ret; /* Find optional mode */ token = strsep(&spec, ","); if (token) { ret = match_string(vmpressure_str_modes, VMPRESSURE_NUM_MODES, token); if (ret < 0) goto out; mode = ret; } ev = kzalloc(sizeof(*ev), GFP_KERNEL); if (!ev) { ret = -ENOMEM; goto out; } ev->efd = eventfd; ev->level = level; ev->mode = mode; mutex_lock(&vmpr->events_lock); list_add(&ev->node, &vmpr->events); mutex_unlock(&vmpr->events_lock); ret = 0; out: kfree(spec_orig); return ret; } /** * vmpressure_unregister_event() - Unbind eventfd from vmpressure * @memcg: memcg handle * @eventfd: eventfd context that was used to link vmpressure with the @cg * * This function does internal manipulations to detach the @eventfd from * the vmpressure notifications, and then frees internal resources * associated with the @eventfd (but the @eventfd itself is not freed). * * To be used as memcg event method. */ void vmpressure_unregister_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd) { struct vmpressure *vmpr = memcg_to_vmpressure(memcg); struct vmpressure_event *ev; mutex_lock(&vmpr->events_lock); list_for_each_entry(ev, &vmpr->events, node) { if (ev->efd != eventfd) continue; list_del(&ev->node); kfree(ev); break; } mutex_unlock(&vmpr->events_lock); } /** * vmpressure_init() - Initialize vmpressure control structure * @vmpr: Structure to be initialized * * This function should be called on every allocated vmpressure structure * before any usage. */ void vmpressure_init(struct vmpressure *vmpr) { spin_lock_init(&vmpr->sr_lock); mutex_init(&vmpr->events_lock); INIT_LIST_HEAD(&vmpr->events); INIT_WORK(&vmpr->work, vmpressure_work_fn); } /** * vmpressure_cleanup() - shuts down vmpressure control structure * @vmpr: Structure to be cleaned up * * This function should be called before the structure in which it is * embedded is cleaned up. */ void vmpressure_cleanup(struct vmpressure *vmpr) { /* * Make sure there is no pending work before eventfd infrastructure * goes away. */ flush_work(&vmpr->work); }
16 16 16 18 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 // SPDX-License-Identifier: GPL-2.0+ /** * DOC: vkms (Virtual Kernel Modesetting) * * VKMS is a software-only model of a KMS driver that is useful for testing * and for running X (or similar) on headless machines. VKMS aims to enable * a virtual display with no need of a hardware display capability, releasing * the GPU in DRM API tests. */ #include <linux/module.h> #include <linux/platform_device.h> #include <linux/dma-mapping.h> #include <drm/drm_gem.h> #include <drm/drm_atomic.h> #include <drm/drm_atomic_helper.h> #include <drm/drm_drv.h> #include <drm/drm_fbdev_generic.h> #include <drm/drm_file.h> #include <drm/drm_gem_framebuffer_helper.h> #include <drm/drm_ioctl.h> #include <drm/drm_managed.h> #include <drm/drm_probe_helper.h> #include <drm/drm_gem_shmem_helper.h> #include <drm/drm_vblank.h> #include "vkms_drv.h" #include <drm/drm_print.h> #include <drm/drm_debugfs.h> #define DRIVER_NAME "vkms" #define DRIVER_DESC "Virtual Kernel Mode Setting" #define DRIVER_DATE "20180514" #define DRIVER_MAJOR 1 #define DRIVER_MINOR 0 static struct vkms_config *default_config; static bool enable_cursor = true; module_param_named(enable_cursor, enable_cursor, bool, 0444); MODULE_PARM_DESC(enable_cursor, "Enable/Disable cursor support"); static bool enable_writeback = true; module_param_named(enable_writeback, enable_writeback, bool, 0444); MODULE_PARM_DESC(enable_writeback, "Enable/Disable writeback connector support"); static bool enable_overlay; module_param_named(enable_overlay, enable_overlay, bool, 0444); MODULE_PARM_DESC(enable_overlay, "Enable/Disable overlay support"); DEFINE_DRM_GEM_FOPS(vkms_driver_fops); static void vkms_release(struct drm_device *dev) { struct vkms_device *vkms = drm_device_to_vkms_device(dev); if (vkms->output.composer_workq) destroy_workqueue(vkms->output.composer_workq); } static void vkms_atomic_commit_tail(struct drm_atomic_state *old_state) { struct drm_device *dev = old_state->dev; struct drm_crtc *crtc; struct drm_crtc_state *old_crtc_state; int i; drm_atomic_helper_commit_modeset_disables(dev, old_state); drm_atomic_helper_commit_planes(dev, old_state, 0); drm_atomic_helper_commit_modeset_enables(dev, old_state); drm_atomic_helper_fake_vblank(old_state); drm_atomic_helper_commit_hw_done(old_state); drm_atomic_helper_wait_for_flip_done(dev, old_state); for_each_old_crtc_in_state(old_state, crtc, old_crtc_state, i) { struct vkms_crtc_state *vkms_state = to_vkms_crtc_state(old_crtc_state); flush_work(&vkms_state->composer_work); } drm_atomic_helper_cleanup_planes(dev, old_state); } static int vkms_config_show(struct seq_file *m, void *data) { struct drm_debugfs_entry *entry = m->private; struct drm_device *dev = entry->dev; struct vkms_device *vkmsdev = drm_device_to_vkms_device(dev); seq_printf(m, "writeback=%d\n", vkmsdev->config->writeback); seq_printf(m, "cursor=%d\n", vkmsdev->config->cursor); seq_printf(m, "overlay=%d\n", vkmsdev->config->overlay); return 0; } static const struct drm_debugfs_info vkms_config_debugfs_list[] = { { "vkms_config", vkms_config_show, 0 }, }; static const struct drm_driver vkms_driver = { .driver_features = DRIVER_MODESET | DRIVER_ATOMIC | DRIVER_GEM, .release = vkms_release, .fops = &vkms_driver_fops, DRM_GEM_SHMEM_DRIVER_OPS, .name = DRIVER_NAME, .desc = DRIVER_DESC, .date = DRIVER_DATE, .major = DRIVER_MAJOR, .minor = DRIVER_MINOR, }; static int vkms_atomic_check(struct drm_device *dev, struct drm_atomic_state *state) { struct drm_crtc *crtc; struct drm_crtc_state *new_crtc_state; int i; for_each_new_crtc_in_state(state, crtc, new_crtc_state, i) { if (!new_crtc_state->gamma_lut || !new_crtc_state->color_mgmt_changed) continue; if (new_crtc_state->gamma_lut->length / sizeof(struct drm_color_lut *) > VKMS_LUT_SIZE) return -EINVAL; } return drm_atomic_helper_check(dev, state); } static const struct drm_mode_config_funcs vkms_mode_funcs = { .fb_create = drm_gem_fb_create, .atomic_check = vkms_atomic_check, .atomic_commit = drm_atomic_helper_commit, }; static const struct drm_mode_config_helper_funcs vkms_mode_config_helpers = { .atomic_commit_tail = vkms_atomic_commit_tail, }; static int vkms_modeset_init(struct vkms_device *vkmsdev) { struct drm_device *dev = &vkmsdev->drm; int ret; ret = drmm_mode_config_init(dev); if (ret) return ret; dev->mode_config.funcs = &vkms_mode_funcs; dev->mode_config.min_width = XRES_MIN; dev->mode_config.min_height = YRES_MIN; dev->mode_config.max_width = XRES_MAX; dev->mode_config.max_height = YRES_MAX; dev->mode_config.cursor_width = 512; dev->mode_config.cursor_height = 512; /* FIXME: There's a confusion between bpp and depth between this and * fbdev helpers. We have to go with 0, meaning "pick the default", * which ix XRGB8888 in all cases. */ dev->mode_config.preferred_depth = 0; dev->mode_config.helper_private = &vkms_mode_config_helpers; return vkms_output_init(vkmsdev, 0); } static int vkms_create(struct vkms_config *config) { int ret; struct platform_device *pdev; struct vkms_device *vkms_device; pdev = platform_device_register_simple(DRIVER_NAME, -1, NULL, 0); if (IS_ERR(pdev)) return PTR_ERR(pdev); if (!devres_open_group(&pdev->dev, NULL, GFP_KERNEL)) { ret = -ENOMEM; goto out_unregister; } vkms_device = devm_drm_dev_alloc(&pdev->dev, &vkms_driver, struct vkms_device, drm); if (IS_ERR(vkms_device)) { ret = PTR_ERR(vkms_device); goto out_devres; } vkms_device->platform = pdev; vkms_device->config = config; config->dev = vkms_device; ret = dma_coerce_mask_and_coherent(vkms_device->drm.dev, DMA_BIT_MASK(64)); if (ret) { DRM_ERROR("Could not initialize DMA support\n"); goto out_devres; } ret = drm_vblank_init(&vkms_device->drm, 1); if (ret) { DRM_ERROR("Failed to vblank\n"); goto out_devres; } ret = vkms_modeset_init(vkms_device); if (ret) goto out_devres; drm_debugfs_add_files(&vkms_device->drm, vkms_config_debugfs_list, ARRAY_SIZE(vkms_config_debugfs_list)); ret = drm_dev_register(&vkms_device->drm, 0); if (ret) goto out_devres; drm_fbdev_generic_setup(&vkms_device->drm, 0); return 0; out_devres: devres_release_group(&pdev->dev, NULL); out_unregister: platform_device_unregister(pdev); return ret; } static int __init vkms_init(void) { int ret; struct vkms_config *config; config = kmalloc(sizeof(*config), GFP_KERNEL); if (!config) return -ENOMEM; default_config = config; config->cursor = enable_cursor; config->writeback = enable_writeback; config->overlay = enable_overlay; ret = vkms_create(config); if (ret) kfree(config); return ret; } static void vkms_destroy(struct vkms_config *config) { struct platform_device *pdev; if (!config->dev) { DRM_INFO("vkms_device is NULL.\n"); return; } pdev = config->dev->platform; drm_dev_unregister(&config->dev->drm); drm_atomic_helper_shutdown(&config->dev->drm); devres_release_group(&pdev->dev, NULL); platform_device_unregister(pdev); config->dev = NULL; } static void __exit vkms_exit(void) { if (default_config->dev) vkms_destroy(default_config); kfree(default_config); } module_init(vkms_init); module_exit(vkms_exit); MODULE_AUTHOR("Haneen Mohammed <hamohammed.sa@gmail.com>"); MODULE_AUTHOR("Rodrigo Siqueira <rodrigosiqueiramelo@gmail.com>"); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL");
58 36 51 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2012 Regents of the University of California */ #ifndef _ASM_RISCV_TLB_H #define _ASM_RISCV_TLB_H struct mmu_gather; static void tlb_flush(struct mmu_gather *tlb); #ifdef CONFIG_MMU #include <linux/swap.h> /* * While riscv platforms with riscv_ipi_for_rfence as true require an IPI to * perform TLB shootdown, some platforms with riscv_ipi_for_rfence as false use * SBI to perform TLB shootdown. To keep software pagetable walkers safe in this * case we switch to RCU based table free (MMU_GATHER_RCU_TABLE_FREE). See the * comment below 'ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE' in include/asm-generic/tlb.h * for more details. */ static inline void __tlb_remove_table(void *table) { free_page_and_swap_cache(table); } #endif /* CONFIG_MMU */ #define tlb_flush tlb_flush #include <asm-generic/tlb.h> static inline void tlb_flush(struct mmu_gather *tlb) { #ifdef CONFIG_MMU if (tlb->fullmm || tlb->need_flush_all || tlb->freed_tables) flush_tlb_mm(tlb->mm); else flush_tlb_mm_range(tlb->mm, tlb->start, tlb->end, tlb_get_unmap_size(tlb)); #endif } #endif /* _ASM_RISCV_TLB_H */
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2001 Paul Stewart * Copyright (c) 2001 Vojtech Pavlik * * HID char devices, giving access to raw HID device events. */ /* * * Should you need to contact me, the author, you can do so either by * e-mail - mail your message to Paul Stewart <stewart@wetlogic.net> */ #include <linux/poll.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/module.h> #include <linux/init.h> #include <linux/input.h> #include <linux/usb.h> #include <linux/hid.h> #include <linux/hiddev.h> #include <linux/compat.h> #include <linux/vmalloc.h> #include <linux/nospec.h> #include "usbhid.h" #ifdef CONFIG_USB_DYNAMIC_MINORS #define HIDDEV_MINOR_BASE 0 #define HIDDEV_MINORS 256 #else #define HIDDEV_MINOR_BASE 96 #define HIDDEV_MINORS 16 #endif #define HIDDEV_BUFFER_SIZE 2048 struct hiddev_list { struct hiddev_usage_ref buffer[HIDDEV_BUFFER_SIZE]; int head; int tail; unsigned flags; struct fasync_struct *fasync; struct hiddev *hiddev; struct list_head node; struct mutex thread_lock; }; /* * Find a report, given the report's type and ID. The ID can be specified * indirectly by REPORT_ID_FIRST (which returns the first report of the given * type) or by (REPORT_ID_NEXT | old_id), which returns the next report of the * given type which follows old_id. */ static struct hid_report * hiddev_lookup_report(struct hid_device *hid, struct hiddev_report_info *rinfo) { unsigned int flags = rinfo->report_id & ~HID_REPORT_ID_MASK; unsigned int rid = rinfo->report_id & HID_REPORT_ID_MASK; struct hid_report_enum *report_enum; struct hid_report *report; struct list_head *list; if (rinfo->report_type < HID_REPORT_TYPE_MIN || rinfo->report_type > HID_REPORT_TYPE_MAX) return NULL; report_enum = hid->report_enum + (rinfo->report_type - HID_REPORT_TYPE_MIN); switch (flags) { case 0: /* Nothing to do -- report_id is already set correctly */ break; case HID_REPORT_ID_FIRST: if (list_empty(&report_enum->report_list)) return NULL; list = report_enum->report_list.next; report = list_entry(list, struct hid_report, list); rinfo->report_id = report->id; break; case HID_REPORT_ID_NEXT: report = report_enum->report_id_hash[rid]; if (!report) return NULL; list = report->list.next; if (list == &report_enum->report_list) return NULL; report = list_entry(list, struct hid_report, list); rinfo->report_id = report->id; break; default: return NULL; } return report_enum->report_id_hash[rinfo->report_id]; } /* * Perform an exhaustive search of the report table for a usage, given its * type and usage id. */ static struct hid_field * hiddev_lookup_usage(struct hid_device *hid, struct hiddev_usage_ref *uref) { int i, j; struct hid_report *report; struct hid_report_enum *report_enum; struct hid_field *field; if (uref->report_type < HID_REPORT_TYPE_MIN || uref->report_type > HID_REPORT_TYPE_MAX) return NULL; report_enum = hid->report_enum + (uref->report_type - HID_REPORT_TYPE_MIN); list_for_each_entry(report, &report_enum->report_list, list) { for (i = 0; i < report->maxfield; i++) { field = report->field[i]; for (j = 0; j < field->maxusage; j++) { if (field->usage[j].hid == uref->usage_code) { uref->report_id = report->id; uref->field_index = i; uref->usage_index = j; return field; } } } } return NULL; } static void hiddev_send_event(struct hid_device *hid, struct hiddev_usage_ref *uref) { struct hiddev *hiddev = hid->hiddev; struct hiddev_list *list; unsigned long flags; spin_lock_irqsave(&hiddev->list_lock, flags); list_for_each_entry(list, &hiddev->list, node) { if (uref->field_index != HID_FIELD_INDEX_NONE || (list->flags & HIDDEV_FLAG_REPORT) != 0) { list->buffer[list->head] = *uref; list->head = (list->head + 1) & (HIDDEV_BUFFER_SIZE - 1); kill_fasync(&list->fasync, SIGIO, POLL_IN); } } spin_unlock_irqrestore(&hiddev->list_lock, flags); wake_up_interruptible(&hiddev->wait); } /* * This is where hid.c calls into hiddev to pass an event that occurred over * the interrupt pipe */ void hiddev_hid_event(struct hid_device *hid, struct hid_field *field, struct hid_usage *usage, __s32 value) { unsigned type = field->report_type; struct hiddev_usage_ref uref; uref.report_type = (type == HID_INPUT_REPORT) ? HID_REPORT_TYPE_INPUT : ((type == HID_OUTPUT_REPORT) ? HID_REPORT_TYPE_OUTPUT : ((type == HID_FEATURE_REPORT) ? HID_REPORT_TYPE_FEATURE : 0)); uref.report_id = field->report->id; uref.field_index = field->index; uref.usage_index = (usage - field->usage); uref.usage_code = usage->hid; uref.value = value; hiddev_send_event(hid, &uref); } EXPORT_SYMBOL_GPL(hiddev_hid_event); void hiddev_report_event(struct hid_device *hid, struct hid_report *report) { unsigned type = report->type; struct hiddev_usage_ref uref; memset(&uref, 0, sizeof(uref)); uref.report_type = (type == HID_INPUT_REPORT) ? HID_REPORT_TYPE_INPUT : ((type == HID_OUTPUT_REPORT) ? HID_REPORT_TYPE_OUTPUT : ((type == HID_FEATURE_REPORT) ? HID_REPORT_TYPE_FEATURE : 0)); uref.report_id = report->id; uref.field_index = HID_FIELD_INDEX_NONE; hiddev_send_event(hid, &uref); } /* * fasync file op */ static int hiddev_fasync(int fd, struct file *file, int on) { struct hiddev_list *list = file->private_data; return fasync_helper(fd, file, on, &list->fasync); } /* * release file op */ static int hiddev_release(struct inode * inode, struct file * file) { struct hiddev_list *list = file->private_data; unsigned long flags; spin_lock_irqsave(&list->hiddev->list_lock, flags); list_del(&list->node); spin_unlock_irqrestore(&list->hiddev->list_lock, flags); mutex_lock(&list->hiddev->existancelock); if (!--list->hiddev->open) { if (list->hiddev->exist) { hid_hw_close(list->hiddev->hid); hid_hw_power(list->hiddev->hid, PM_HINT_NORMAL); } else { mutex_unlock(&list->hiddev->existancelock); kfree(list->hiddev); vfree(list); return 0; } } mutex_unlock(&list->hiddev->existancelock); vfree(list); return 0; } static int __hiddev_open(struct hiddev *hiddev, struct file *file) { struct hiddev_list *list; int error; lockdep_assert_held(&hiddev->existancelock); list = vzalloc(sizeof(*list)); if (!list) return -ENOMEM; mutex_init(&list->thread_lock); list->hiddev = hiddev; if (!hiddev->open++) { error = hid_hw_power(hiddev->hid, PM_HINT_FULLON); if (error < 0) goto err_drop_count; error = hid_hw_open(hiddev->hid); if (error < 0) goto err_normal_power; } spin_lock_irq(&hiddev->list_lock); list_add_tail(&list->node, &hiddev->list); spin_unlock_irq(&hiddev->list_lock); file->private_data = list; return 0; err_normal_power: hid_hw_power(hiddev->hid, PM_HINT_NORMAL); err_drop_count: hiddev->open--; vfree(list); return error; } /* * open file op */ static int hiddev_open(struct inode *inode, struct file *file) { struct usb_interface *intf; struct hid_device *hid; struct hiddev *hiddev; int res; intf = usbhid_find_interface(iminor(inode)); if (!intf) return -ENODEV; hid = usb_get_intfdata(intf); hiddev = hid->hiddev; mutex_lock(&hiddev->existancelock); res = hiddev->exist ? __hiddev_open(hiddev, file) : -ENODEV; mutex_unlock(&hiddev->existancelock); return res; } /* * "write" file op */ static ssize_t hiddev_write(struct file * file, const char __user * buffer, size_t count, loff_t *ppos) { return -EINVAL; } /* * "read" file op */ static ssize_t hiddev_read(struct file * file, char __user * buffer, size_t count, loff_t *ppos) { DEFINE_WAIT(wait); struct hiddev_list *list = file->private_data; int event_size; int retval; event_size = ((list->flags & HIDDEV_FLAG_UREF) != 0) ? sizeof(struct hiddev_usage_ref) : sizeof(struct hiddev_event); if (count < event_size) return 0; /* lock against other threads */ retval = mutex_lock_interruptible(&list->thread_lock); if (retval) return -ERESTARTSYS; while (retval == 0) { if (list->head == list->tail) { prepare_to_wait(&list->hiddev->wait, &wait, TASK_INTERRUPTIBLE); while (list->head == list->tail) { if (signal_pending(current)) { retval = -ERESTARTSYS; break; } if (!list->hiddev->exist) { retval = -EIO; break; } if (file->f_flags & O_NONBLOCK) { retval = -EAGAIN; break; } /* let O_NONBLOCK tasks run */ mutex_unlock(&list->thread_lock); schedule(); if (mutex_lock_interruptible(&list->thread_lock)) { finish_wait(&list->hiddev->wait, &wait); return -EINTR; } set_current_state(TASK_INTERRUPTIBLE); } finish_wait(&list->hiddev->wait, &wait); } if (retval) { mutex_unlock(&list->thread_lock); return retval; } while (list->head != list->tail && retval + event_size <= count) { if ((list->flags & HIDDEV_FLAG_UREF) == 0) { if (list->buffer[list->tail].field_index != HID_FIELD_INDEX_NONE) { struct hiddev_event event; event.hid = list->buffer[list->tail].usage_code; event.value = list->buffer[list->tail].value; if (copy_to_user(buffer + retval, &event, sizeof(struct hiddev_event))) { mutex_unlock(&list->thread_lock); return -EFAULT; } retval += sizeof(struct hiddev_event); } } else { if (list->buffer[list->tail].field_index != HID_FIELD_INDEX_NONE || (list->flags & HIDDEV_FLAG_REPORT) != 0) { if (copy_to_user(buffer + retval, list->buffer + list->tail, sizeof(struct hiddev_usage_ref))) { mutex_unlock(&list->thread_lock); return -EFAULT; } retval += sizeof(struct hiddev_usage_ref); } } list->tail = (list->tail + 1) & (HIDDEV_BUFFER_SIZE - 1); } } mutex_unlock(&list->thread_lock); return retval; } /* * "poll" file op * No kernel lock - fine */ static __poll_t hiddev_poll(struct file *file, poll_table *wait) { struct hiddev_list *list = file->private_data; poll_wait(file, &list->hiddev->wait, wait); if (list->head != list->tail) return EPOLLIN | EPOLLRDNORM | EPOLLOUT; if (!list->hiddev->exist) return EPOLLERR | EPOLLHUP; return 0; } /* * "ioctl" file op */ static noinline int hiddev_ioctl_usage(struct hiddev *hiddev, unsigned int cmd, void __user *user_arg) { struct hid_device *hid = hiddev->hid; struct hiddev_report_info rinfo; struct hiddev_usage_ref_multi *uref_multi = NULL; struct hiddev_usage_ref *uref; struct hid_report *report; struct hid_field *field; int i; uref_multi = kmalloc(sizeof(struct hiddev_usage_ref_multi), GFP_KERNEL); if (!uref_multi) return -ENOMEM; uref = &uref_multi->uref; if (cmd == HIDIOCGUSAGES || cmd == HIDIOCSUSAGES) { if (copy_from_user(uref_multi, user_arg, sizeof(*uref_multi))) goto fault; } else { if (copy_from_user(uref, user_arg, sizeof(*uref))) goto fault; } switch (cmd) { case HIDIOCGUCODE: rinfo.report_type = uref->report_type; rinfo.report_id = uref->report_id; if ((report = hiddev_lookup_report(hid, &rinfo)) == NULL) goto inval; if (uref->field_index >= report->maxfield) goto inval; uref->field_index = array_index_nospec(uref->field_index, report->maxfield); field = report->field[uref->field_index]; if (uref->usage_index >= field->maxusage) goto inval; uref->usage_index = array_index_nospec(uref->usage_index, field->maxusage); uref->usage_code = field->usage[uref->usage_index].hid; if (copy_to_user(user_arg, uref, sizeof(*uref))) goto fault; goto goodreturn; default: if (cmd != HIDIOCGUSAGE && cmd != HIDIOCGUSAGES && uref->report_type == HID_REPORT_TYPE_INPUT) goto inval; if (uref->report_id == HID_REPORT_ID_UNKNOWN) { field = hiddev_lookup_usage(hid, uref); if (field == NULL) goto inval; } else { rinfo.report_type = uref->report_type; rinfo.report_id = uref->report_id; if ((report = hiddev_lookup_report(hid, &rinfo)) == NULL) goto inval; if (uref->field_index >= report->maxfield) goto inval; uref->field_index = array_index_nospec(uref->field_index, report->maxfield); field = report->field[uref->field_index]; if (cmd == HIDIOCGCOLLECTIONINDEX) { if (uref->usage_index >= field->maxusage) goto inval; uref->usage_index = array_index_nospec(uref->usage_index, field->maxusage); } else if (uref->usage_index >= field->report_count) goto inval; } if (cmd == HIDIOCGUSAGES || cmd == HIDIOCSUSAGES) { if (uref_multi->num_values > HID_MAX_MULTI_USAGES || uref->usage_index + uref_multi->num_values > field->report_count) goto inval; uref->usage_index = array_index_nospec(uref->usage_index, field->report_count - uref_multi->num_values); } switch (cmd) { case HIDIOCGUSAGE: if (uref->usage_index >= field->report_count) goto inval; uref->value = field->value[uref->usage_index]; if (copy_to_user(user_arg, uref, sizeof(*uref))) goto fault; goto goodreturn; case HIDIOCSUSAGE: if (uref->usage_index >= field->report_count) goto inval; field->value[uref->usage_index] = uref->value; goto goodreturn; case HIDIOCGCOLLECTIONINDEX: i = field->usage[uref->usage_index].collection_index; kfree(uref_multi); return i; case HIDIOCGUSAGES: for (i = 0; i < uref_multi->num_values; i++) uref_multi->values[i] = field->value[uref->usage_index + i]; if (copy_to_user(user_arg, uref_multi, sizeof(*uref_multi))) goto fault; goto goodreturn; case HIDIOCSUSAGES: for (i = 0; i < uref_multi->num_values; i++) field->value[uref->usage_index + i] = uref_multi->values[i]; goto goodreturn; } goodreturn: kfree(uref_multi); return 0; fault: kfree(uref_multi); return -EFAULT; inval: kfree(uref_multi); return -EINVAL; } } static noinline int hiddev_ioctl_string(struct hiddev *hiddev, unsigned int cmd, void __user *user_arg) { struct hid_device *hid = hiddev->hid; struct usb_device *dev = hid_to_usb_dev(hid); int idx, len; char *buf; if (get_user(idx, (int __user *)user_arg)) return -EFAULT; if ((buf = kmalloc(HID_STRING_SIZE, GFP_KERNEL)) == NULL) return -ENOMEM; if ((len = usb_string(dev, idx, buf, HID_STRING_SIZE-1)) < 0) { kfree(buf); return -EINVAL; } if (copy_to_user(user_arg+sizeof(int), buf, len+1)) { kfree(buf); return -EFAULT; } kfree(buf); return len; } static long hiddev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct hiddev_list *list = file->private_data; struct hiddev *hiddev = list->hiddev; struct hid_device *hid; struct hiddev_collection_info cinfo; struct hiddev_report_info rinfo; struct hiddev_field_info finfo; struct hiddev_devinfo dinfo; struct hid_report *report; struct hid_field *field; void __user *user_arg = (void __user *)arg; int i, r = -EINVAL; /* Called without BKL by compat methods so no BKL taken */ mutex_lock(&hiddev->existancelock); if (!hiddev->exist) { r = -ENODEV; goto ret_unlock; } hid = hiddev->hid; switch (cmd) { case HIDIOCGVERSION: r = put_user(HID_VERSION, (int __user *)arg) ? -EFAULT : 0; break; case HIDIOCAPPLICATION: if (arg >= hid->maxapplication) break; for (i = 0; i < hid->maxcollection; i++) if (hid->collection[i].type == HID_COLLECTION_APPLICATION && arg-- == 0) break; if (i < hid->maxcollection) r = hid->collection[i].usage; break; case HIDIOCGDEVINFO: { struct usb_device *dev = hid_to_usb_dev(hid); struct usbhid_device *usbhid = hid->driver_data; memset(&dinfo, 0, sizeof(dinfo)); dinfo.bustype = BUS_USB; dinfo.busnum = dev->bus->busnum; dinfo.devnum = dev->devnum; dinfo.ifnum = usbhid->ifnum; dinfo.vendor = le16_to_cpu(dev->descriptor.idVendor); dinfo.product = le16_to_cpu(dev->descriptor.idProduct); dinfo.version = le16_to_cpu(dev->descriptor.bcdDevice); dinfo.num_applications = hid->maxapplication; r = copy_to_user(user_arg, &dinfo, sizeof(dinfo)) ? -EFAULT : 0; break; } case HIDIOCGFLAG: r = put_user(list->flags, (int __user *)arg) ? -EFAULT : 0; break; case HIDIOCSFLAG: { int newflags; if (get_user(newflags, (int __user *)arg)) { r = -EFAULT; break; } if ((newflags & ~HIDDEV_FLAGS) != 0 || ((newflags & HIDDEV_FLAG_REPORT) != 0 && (newflags & HIDDEV_FLAG_UREF) == 0)) break; list->flags = newflags; r = 0; break; } case HIDIOCGSTRING: r = hiddev_ioctl_string(hiddev, cmd, user_arg); break; case HIDIOCINITREPORT: usbhid_init_reports(hid); hiddev->initialized = true; r = 0; break; case HIDIOCGREPORT: if (copy_from_user(&rinfo, user_arg, sizeof(rinfo))) { r = -EFAULT; break; } if (rinfo.report_type == HID_REPORT_TYPE_OUTPUT) break; report = hiddev_lookup_report(hid, &rinfo); if (report == NULL) break; hid_hw_request(hid, report, HID_REQ_GET_REPORT); hid_hw_wait(hid); r = 0; break; case HIDIOCSREPORT: if (copy_from_user(&rinfo, user_arg, sizeof(rinfo))) { r = -EFAULT; break; } if (rinfo.report_type == HID_REPORT_TYPE_INPUT) break; report = hiddev_lookup_report(hid, &rinfo); if (report == NULL) break; hid_hw_request(hid, report, HID_REQ_SET_REPORT); hid_hw_wait(hid); r = 0; break; case HIDIOCGREPORTINFO: if (copy_from_user(&rinfo, user_arg, sizeof(rinfo))) { r = -EFAULT; break; } report = hiddev_lookup_report(hid, &rinfo); if (report == NULL) break; rinfo.num_fields = report->maxfield; r = copy_to_user(user_arg, &rinfo, sizeof(rinfo)) ? -EFAULT : 0; break; case HIDIOCGFIELDINFO: if (copy_from_user(&finfo, user_arg, sizeof(finfo))) { r = -EFAULT; break; } rinfo.report_type = finfo.report_type; rinfo.report_id = finfo.report_id; report = hiddev_lookup_report(hid, &rinfo); if (report == NULL) break; if (finfo.field_index >= report->maxfield) break; finfo.field_index = array_index_nospec(finfo.field_index, report->maxfield); field = report->field[finfo.field_index]; memset(&finfo, 0, sizeof(finfo)); finfo.report_type = rinfo.report_type; finfo.report_id = rinfo.report_id; finfo.field_index = field->report_count - 1; finfo.maxusage = field->maxusage; finfo.flags = field->flags; finfo.physical = field->physical; finfo.logical = field->logical; finfo.application = field->application; finfo.logical_minimum = field->logical_minimum; finfo.logical_maximum = field->logical_maximum; finfo.physical_minimum = field->physical_minimum; finfo.physical_maximum = field->physical_maximum; finfo.unit_exponent = field->unit_exponent; finfo.unit = field->unit; r = copy_to_user(user_arg, &finfo, sizeof(finfo)) ? -EFAULT : 0; break; case HIDIOCGUCODE: case HIDIOCGUSAGE: case HIDIOCSUSAGE: case HIDIOCGUSAGES: case HIDIOCSUSAGES: case HIDIOCGCOLLECTIONINDEX: if (!hiddev->initialized) { usbhid_init_reports(hid); hiddev->initialized = true; } r = hiddev_ioctl_usage(hiddev, cmd, user_arg); break; case HIDIOCGCOLLECTIONINFO: if (copy_from_user(&cinfo, user_arg, sizeof(cinfo))) { r = -EFAULT; break; } if (cinfo.index >= hid->maxcollection) break; cinfo.index = array_index_nospec(cinfo.index, hid->maxcollection); cinfo.type = hid->collection[cinfo.index].type; cinfo.usage = hid->collection[cinfo.index].usage; cinfo.level = hid->collection[cinfo.index].level; r = copy_to_user(user_arg, &cinfo, sizeof(cinfo)) ? -EFAULT : 0; break; default: if (_IOC_TYPE(cmd) != 'H' || _IOC_DIR(cmd) != _IOC_READ) break; if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGNAME(0))) { int len = strlen(hid->name) + 1; if (len > _IOC_SIZE(cmd)) len = _IOC_SIZE(cmd); r = copy_to_user(user_arg, hid->name, len) ? -EFAULT : len; break; } if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGPHYS(0))) { int len = strlen(hid->phys) + 1; if (len > _IOC_SIZE(cmd)) len = _IOC_SIZE(cmd); r = copy_to_user(user_arg, hid->phys, len) ? -EFAULT : len; break; } } ret_unlock: mutex_unlock(&hiddev->existancelock); return r; } static const struct file_operations hiddev_fops = { .owner = THIS_MODULE, .read = hiddev_read, .write = hiddev_write, .poll = hiddev_poll, .open = hiddev_open, .release = hiddev_release, .unlocked_ioctl = hiddev_ioctl, .fasync = hiddev_fasync, .compat_ioctl = compat_ptr_ioctl, .llseek = noop_llseek, }; static char *hiddev_devnode(const struct device *dev, umode_t *mode) { return kasprintf(GFP_KERNEL, "usb/%s", dev_name(dev)); } static struct usb_class_driver hiddev_class = { .name = "hiddev%d", .devnode = hiddev_devnode, .fops = &hiddev_fops, .minor_base = HIDDEV_MINOR_BASE, }; /* * This is where hid.c calls us to connect a hid device to the hiddev driver */ int hiddev_connect(struct hid_device *hid, unsigned int force) { struct hiddev *hiddev; struct usbhid_device *usbhid = hid->driver_data; int retval; if (!force) { unsigned int i; for (i = 0; i < hid->maxcollection; i++) if (hid->collection[i].type == HID_COLLECTION_APPLICATION && !IS_INPUT_APPLICATION(hid->collection[i].usage)) break; if (i == hid->maxcollection) return -EINVAL; } if (!(hiddev = kzalloc(sizeof(struct hiddev), GFP_KERNEL))) return -ENOMEM; init_waitqueue_head(&hiddev->wait); INIT_LIST_HEAD(&hiddev->list); spin_lock_init(&hiddev->list_lock); mutex_init(&hiddev->existancelock); hid->hiddev = hiddev; hiddev->hid = hid; hiddev->exist = 1; retval = usb_register_dev(usbhid->intf, &hiddev_class); if (retval) { hid_err(hid, "Not able to get a minor for this device\n"); hid->hiddev = NULL; kfree(hiddev); return retval; } /* * If HID_QUIRK_NO_INIT_REPORTS is set, make sure we don't initialize * the reports. */ hiddev->initialized = hid->quirks & HID_QUIRK_NO_INIT_REPORTS; hiddev->minor = usbhid->intf->minor; return 0; } /* * This is where hid.c calls us to disconnect a hiddev device from the * corresponding hid device (usually because the usb device has disconnected) */ static struct usb_class_driver hiddev_class; void hiddev_disconnect(struct hid_device *hid) { struct hiddev *hiddev = hid->hiddev; struct usbhid_device *usbhid = hid->driver_data; usb_deregister_dev(usbhid->intf, &hiddev_class); mutex_lock(&hiddev->existancelock); hiddev->exist = 0; if (hiddev->open) { hid_hw_close(hiddev->hid); wake_up_interruptible(&hiddev->wait); mutex_unlock(&hiddev->existancelock); } else { mutex_unlock(&hiddev->existancelock); kfree(hiddev); } }
1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 1 1 1 2 2 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright Red Hat Inc. 2022 * * This file is part of the SCTP kernel implementation * * These functions manipulate sctp stream queue/scheduling. * * Please send any bug reports or fixes you make to the * email addresched(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Xin Long <lucien.xin@gmail.com> */ #include <linux/list.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/stream_sched.h> /* Fair Capacity and Weighted Fair Queueing handling * RFC 8260 section 3.5 and 3.6 */ static void sctp_sched_fc_unsched_all(struct sctp_stream *stream); static int sctp_sched_wfq_set(struct sctp_stream *stream, __u16 sid, __u16 weight, gfp_t gfp) { struct sctp_stream_out_ext *soute = SCTP_SO(stream, sid)->ext; if (!weight) return -EINVAL; soute->fc_weight = weight; return 0; } static int sctp_sched_wfq_get(struct sctp_stream *stream, __u16 sid, __u16 *value) { struct sctp_stream_out_ext *soute = SCTP_SO(stream, sid)->ext; *value = soute->fc_weight; return 0; } static int sctp_sched_fc_set(struct sctp_stream *stream, __u16 sid, __u16 weight, gfp_t gfp) { return 0; } static int sctp_sched_fc_get(struct sctp_stream *stream, __u16 sid, __u16 *value) { return 0; } static int sctp_sched_fc_init(struct sctp_stream *stream) { INIT_LIST_HEAD(&stream->fc_list); return 0; } static int sctp_sched_fc_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp) { struct sctp_stream_out_ext *soute = SCTP_SO(stream, sid)->ext; INIT_LIST_HEAD(&soute->fc_list); soute->fc_length = 0; soute->fc_weight = 1; return 0; } static void sctp_sched_fc_free_sid(struct sctp_stream *stream, __u16 sid) { } static void sctp_sched_fc_sched(struct sctp_stream *stream, struct sctp_stream_out_ext *soute) { struct sctp_stream_out_ext *pos; if (!list_empty(&soute->fc_list)) return; list_for_each_entry(pos, &stream->fc_list, fc_list) if ((__u64)pos->fc_length * soute->fc_weight >= (__u64)soute->fc_length * pos->fc_weight) break; list_add_tail(&soute->fc_list, &pos->fc_list); } static void sctp_sched_fc_enqueue(struct sctp_outq *q, struct sctp_datamsg *msg) { struct sctp_stream *stream; struct sctp_chunk *ch; __u16 sid; ch = list_first_entry(&msg->chunks, struct sctp_chunk, frag_list); sid = sctp_chunk_stream_no(ch); stream = &q->asoc->stream; sctp_sched_fc_sched(stream, SCTP_SO(stream, sid)->ext); } static struct sctp_chunk *sctp_sched_fc_dequeue(struct sctp_outq *q) { struct sctp_stream *stream = &q->asoc->stream; struct sctp_stream_out_ext *soute; struct sctp_chunk *ch; /* Bail out quickly if queue is empty */ if (list_empty(&q->out_chunk_list)) return NULL; /* Find which chunk is next */ if (stream->out_curr) soute = stream->out_curr->ext; else soute = list_entry(stream->fc_list.next, struct sctp_stream_out_ext, fc_list); ch = list_entry(soute->outq.next, struct sctp_chunk, stream_list); sctp_sched_dequeue_common(q, ch); return ch; } static void sctp_sched_fc_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch) { struct sctp_stream *stream = &q->asoc->stream; struct sctp_stream_out_ext *soute, *pos; __u16 sid, i; sid = sctp_chunk_stream_no(ch); soute = SCTP_SO(stream, sid)->ext; /* reduce all fc_lengths by U32_MAX / 4 if the current fc_length overflows. */ if (soute->fc_length > U32_MAX - ch->skb->len) { for (i = 0; i < stream->outcnt; i++) { pos = SCTP_SO(stream, i)->ext; if (!pos) continue; if (pos->fc_length <= (U32_MAX >> 2)) { pos->fc_length = 0; continue; } pos->fc_length -= (U32_MAX >> 2); } } soute->fc_length += ch->skb->len; if (list_empty(&soute->outq)) { list_del_init(&soute->fc_list); return; } pos = soute; list_for_each_entry_continue(pos, &stream->fc_list, fc_list) if ((__u64)pos->fc_length * soute->fc_weight >= (__u64)soute->fc_length * pos->fc_weight) break; list_move_tail(&soute->fc_list, &pos->fc_list); } static void sctp_sched_fc_sched_all(struct sctp_stream *stream) { struct sctp_association *asoc; struct sctp_chunk *ch; asoc = container_of(stream, struct sctp_association, stream); list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list) { __u16 sid = sctp_chunk_stream_no(ch); if (SCTP_SO(stream, sid)->ext) sctp_sched_fc_sched(stream, SCTP_SO(stream, sid)->ext); } } static void sctp_sched_fc_unsched_all(struct sctp_stream *stream) { struct sctp_stream_out_ext *soute, *tmp; list_for_each_entry_safe(soute, tmp, &stream->fc_list, fc_list) list_del_init(&soute->fc_list); } static struct sctp_sched_ops sctp_sched_fc = { .set = sctp_sched_fc_set, .get = sctp_sched_fc_get, .init = sctp_sched_fc_init, .init_sid = sctp_sched_fc_init_sid, .free_sid = sctp_sched_fc_free_sid, .enqueue = sctp_sched_fc_enqueue, .dequeue = sctp_sched_fc_dequeue, .dequeue_done = sctp_sched_fc_dequeue_done, .sched_all = sctp_sched_fc_sched_all, .unsched_all = sctp_sched_fc_unsched_all, }; void sctp_sched_ops_fc_init(void) { sctp_sched_ops_register(SCTP_SS_FC, &sctp_sched_fc); } static struct sctp_sched_ops sctp_sched_wfq = { .set = sctp_sched_wfq_set, .get = sctp_sched_wfq_get, .init = sctp_sched_fc_init, .init_sid = sctp_sched_fc_init_sid, .free_sid = sctp_sched_fc_free_sid, .enqueue = sctp_sched_fc_enqueue, .dequeue = sctp_sched_fc_dequeue, .dequeue_done = sctp_sched_fc_dequeue_done, .sched_all = sctp_sched_fc_sched_all, .unsched_all = sctp_sched_fc_unsched_all, }; void sctp_sched_ops_wfq_init(void) { sctp_sched_ops_register(SCTP_SS_WFQ, &sctp_sched_wfq); }
41 1 5 31 77 1164 827 277 344 809 809 1026 4247 7 21 1 44 5 2 45 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_UACCESS_H__ #define __LINUX_UACCESS_H__ #include <linux/fault-inject-usercopy.h> #include <linux/instrumented.h> #include <linux/minmax.h> #include <linux/sched.h> #include <linux/thread_info.h> #include <asm/uaccess.h> /* * Architectures that support memory tagging (assigning tags to memory regions, * embedding these tags into addresses that point to these memory regions, and * checking that the memory and the pointer tags match on memory accesses) * redefine this macro to strip tags from pointers. * * Passing down mm_struct allows to define untagging rules on per-process * basis. * * It's defined as noop for architectures that don't support memory tagging. */ #ifndef untagged_addr #define untagged_addr(addr) (addr) #endif #ifndef untagged_addr_remote #define untagged_addr_remote(mm, addr) ({ \ mmap_assert_locked(mm); \ untagged_addr(addr); \ }) #endif /* * Architectures should provide two primitives (raw_copy_{to,from}_user()) * and get rid of their private instances of copy_{to,from}_user() and * __copy_{to,from}_user{,_inatomic}(). * * raw_copy_{to,from}_user(to, from, size) should copy up to size bytes and * return the amount left to copy. They should assume that access_ok() has * already been checked (and succeeded); they should *not* zero-pad anything. * No KASAN or object size checks either - those belong here. * * Both of these functions should attempt to copy size bytes starting at from * into the area starting at to. They must not fetch or store anything * outside of those areas. Return value must be between 0 (everything * copied successfully) and size (nothing copied). * * If raw_copy_{to,from}_user(to, from, size) returns N, size - N bytes starting * at to must become equal to the bytes fetched from the corresponding area * starting at from. All data past to + size - N must be left unmodified. * * If copying succeeds, the return value must be 0. If some data cannot be * fetched, it is permitted to copy less than had been fetched; the only * hard requirement is that not storing anything at all (i.e. returning size) * should happen only when nothing could be copied. In other words, you don't * have to squeeze as much as possible - it is allowed, but not necessary. * * For raw_copy_from_user() to always points to kernel memory and no faults * on store should happen. Interpretation of from is affected by set_fs(). * For raw_copy_to_user() it's the other way round. * * Both can be inlined - it's up to architectures whether it wants to bother * with that. They should not be used directly; they are used to implement * the 6 functions (copy_{to,from}_user(), __copy_{to,from}_user_inatomic()) * that are used instead. Out of those, __... ones are inlined. Plain * copy_{to,from}_user() might or might not be inlined. If you want them * inlined, have asm/uaccess.h define INLINE_COPY_{TO,FROM}_USER. * * NOTE: only copy_from_user() zero-pads the destination in case of short copy. * Neither __copy_from_user() nor __copy_from_user_inatomic() zero anything * at all; their callers absolutely must check the return value. * * Biarch ones should also provide raw_copy_in_user() - similar to the above, * but both source and destination are __user pointers (affected by set_fs() * as usual) and both source and destination can trigger faults. */ static __always_inline __must_check unsigned long __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) { unsigned long res; instrument_copy_from_user_before(to, from, n); check_object_size(to, n, false); res = raw_copy_from_user(to, from, n); instrument_copy_from_user_after(to, from, n, res); return res; } static __always_inline __must_check unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) { unsigned long res; might_fault(); instrument_copy_from_user_before(to, from, n); if (should_fail_usercopy()) return n; check_object_size(to, n, false); res = raw_copy_from_user(to, from, n); instrument_copy_from_user_after(to, from, n, res); return res; } /** * __copy_to_user_inatomic: - Copy a block of data into user space, with less checking. * @to: Destination address, in user space. * @from: Source address, in kernel space. * @n: Number of bytes to copy. * * Context: User context only. * * Copy data from kernel space to user space. Caller must check * the specified block with access_ok() before calling this function. * The caller should also make sure he pins the user space address * so that we don't result in page fault and sleep. */ static __always_inline __must_check unsigned long __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) { if (should_fail_usercopy()) return n; instrument_copy_to_user(to, from, n); check_object_size(from, n, true); return raw_copy_to_user(to, from, n); } static __always_inline __must_check unsigned long __copy_to_user(void __user *to, const void *from, unsigned long n) { might_fault(); if (should_fail_usercopy()) return n; instrument_copy_to_user(to, from, n); check_object_size(from, n, true); return raw_copy_to_user(to, from, n); } #ifdef INLINE_COPY_FROM_USER static inline __must_check unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n) { unsigned long res = n; might_fault(); if (!should_fail_usercopy() && likely(access_ok(from, n))) { instrument_copy_from_user_before(to, from, n); res = raw_copy_from_user(to, from, n); instrument_copy_from_user_after(to, from, n, res); } if (unlikely(res)) memset(to + (n - res), 0, res); return res; } #else extern __must_check unsigned long _copy_from_user(void *, const void __user *, unsigned long); #endif #ifdef INLINE_COPY_TO_USER static inline __must_check unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n) { might_fault(); if (should_fail_usercopy()) return n; if (access_ok(to, n)) { instrument_copy_to_user(to, from, n); n = raw_copy_to_user(to, from, n); } return n; } #else extern __must_check unsigned long _copy_to_user(void __user *, const void *, unsigned long); #endif static __always_inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n) { if (check_copy_size(to, n, false)) n = _copy_from_user(to, from, n); return n; } static __always_inline unsigned long __must_check copy_to_user(void __user *to, const void *from, unsigned long n) { if (check_copy_size(from, n, true)) n = _copy_to_user(to, from, n); return n; } #ifndef copy_mc_to_kernel /* * Without arch opt-in this generic copy_mc_to_kernel() will not handle * #MC (or arch equivalent) during source read. */ static inline unsigned long __must_check copy_mc_to_kernel(void *dst, const void *src, size_t cnt) { memcpy(dst, src, cnt); return 0; } #endif static __always_inline void pagefault_disabled_inc(void) { current->pagefault_disabled++; } static __always_inline void pagefault_disabled_dec(void) { current->pagefault_disabled--; } /* * These routines enable/disable the pagefault handler. If disabled, it will * not take any locks and go straight to the fixup table. * * User access methods will not sleep when called from a pagefault_disabled() * environment. */ static inline void pagefault_disable(void) { pagefault_disabled_inc(); /* * make sure to have issued the store before a pagefault * can hit. */ barrier(); } static inline void pagefault_enable(void) { /* * make sure to issue those last loads/stores before enabling * the pagefault handler again. */ barrier(); pagefault_disabled_dec(); } /* * Is the pagefault handler disabled? If so, user access methods will not sleep. */ static inline bool pagefault_disabled(void) { return current->pagefault_disabled != 0; } /* * The pagefault handler is in general disabled by pagefault_disable() or * when in irq context (via in_atomic()). * * This function should only be used by the fault handlers. Other users should * stick to pagefault_disabled(). * Please NEVER use preempt_disable() to disable the fault handler. With * !CONFIG_PREEMPT_COUNT, this is like a NOP. So the handler won't be disabled. * in_atomic() will report different values based on !CONFIG_PREEMPT_COUNT. */ #define faulthandler_disabled() (pagefault_disabled() || in_atomic()) #ifndef CONFIG_ARCH_HAS_SUBPAGE_FAULTS /** * probe_subpage_writeable: probe the user range for write faults at sub-page * granularity (e.g. arm64 MTE) * @uaddr: start of address range * @size: size of address range * * Returns 0 on success, the number of bytes not probed on fault. * * It is expected that the caller checked for the write permission of each * page in the range either by put_user() or GUP. The architecture port can * implement a more efficient get_user() probing if the same sub-page faults * are triggered by either a read or a write. */ static inline size_t probe_subpage_writeable(char __user *uaddr, size_t size) { return 0; } #endif /* CONFIG_ARCH_HAS_SUBPAGE_FAULTS */ #ifndef ARCH_HAS_NOCACHE_UACCESS static inline __must_check unsigned long __copy_from_user_inatomic_nocache(void *to, const void __user *from, unsigned long n) { return __copy_from_user_inatomic(to, from, n); } #endif /* ARCH_HAS_NOCACHE_UACCESS */ extern __must_check int check_zeroed_user(const void __user *from, size_t size); /** * copy_struct_from_user: copy a struct from userspace * @dst: Destination address, in kernel space. This buffer must be @ksize * bytes long. * @ksize: Size of @dst struct. * @src: Source address, in userspace. * @usize: (Alleged) size of @src struct. * * Copies a struct from userspace to kernel space, in a way that guarantees * backwards-compatibility for struct syscall arguments (as long as future * struct extensions are made such that all new fields are *appended* to the * old struct, and zeroed-out new fields have the same meaning as the old * struct). * * @ksize is just sizeof(*dst), and @usize should've been passed by userspace. * The recommended usage is something like the following: * * SYSCALL_DEFINE2(foobar, const struct foo __user *, uarg, size_t, usize) * { * int err; * struct foo karg = {}; * * if (usize > PAGE_SIZE) * return -E2BIG; * if (usize < FOO_SIZE_VER0) * return -EINVAL; * * err = copy_struct_from_user(&karg, sizeof(karg), uarg, usize); * if (err) * return err; * * // ... * } * * There are three cases to consider: * * If @usize == @ksize, then it's copied verbatim. * * If @usize < @ksize, then the userspace has passed an old struct to a * newer kernel. The rest of the trailing bytes in @dst (@ksize - @usize) * are to be zero-filled. * * If @usize > @ksize, then the userspace has passed a new struct to an * older kernel. The trailing bytes unknown to the kernel (@usize - @ksize) * are checked to ensure they are zeroed, otherwise -E2BIG is returned. * * Returns (in all cases, some data may have been copied): * * -E2BIG: (@usize > @ksize) and there are non-zero trailing bytes in @src. * * -EFAULT: access to userspace failed. */ static __always_inline __must_check int copy_struct_from_user(void *dst, size_t ksize, const void __user *src, size_t usize) { size_t size = min(ksize, usize); size_t rest = max(ksize, usize) - size; /* Double check if ksize is larger than a known object size. */ if (WARN_ON_ONCE(ksize > __builtin_object_size(dst, 1))) return -E2BIG; /* Deal with trailing bytes. */ if (usize < ksize) { memset(dst + size, 0, rest); } else if (usize > ksize) { int ret = check_zeroed_user(src + size, rest); if (ret <= 0) return ret ?: -E2BIG; } /* Copy the interoperable parts of the struct. */ if (copy_from_user(dst, src, size)) return -EFAULT; return 0; } bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size); long copy_from_kernel_nofault(void *dst, const void *src, size_t size); long notrace copy_to_kernel_nofault(void *dst, const void *src, size_t size); long copy_from_user_nofault(void *dst, const void __user *src, size_t size); long notrace copy_to_user_nofault(void __user *dst, const void *src, size_t size); long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count); long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr, long count); long strnlen_user_nofault(const void __user *unsafe_addr, long count); #ifndef __get_kernel_nofault #define __get_kernel_nofault(dst, src, type, label) \ do { \ type __user *p = (type __force __user *)(src); \ type data; \ if (__get_user(data, p)) \ goto label; \ *(type *)dst = data; \ } while (0) #define __put_kernel_nofault(dst, src, type, label) \ do { \ type __user *p = (type __force __user *)(dst); \ type data = *(type *)src; \ if (__put_user(data, p)) \ goto label; \ } while (0) #endif /** * get_kernel_nofault(): safely attempt to read from a location * @val: read into this variable * @ptr: address to read from * * Returns 0 on success, or -EFAULT. */ #define get_kernel_nofault(val, ptr) ({ \ const typeof(val) *__gk_ptr = (ptr); \ copy_from_kernel_nofault(&(val), __gk_ptr, sizeof(val));\ }) #ifndef user_access_begin #define user_access_begin(ptr,len) access_ok(ptr, len) #define user_access_end() do { } while (0) #define unsafe_op_wrap(op, err) do { if (unlikely(op)) goto err; } while (0) #define unsafe_get_user(x,p,e) unsafe_op_wrap(__get_user(x,p),e) #define unsafe_put_user(x,p,e) unsafe_op_wrap(__put_user(x,p),e) #define unsafe_copy_to_user(d,s,l,e) unsafe_op_wrap(__copy_to_user(d,s,l),e) #define unsafe_copy_from_user(d,s,l,e) unsafe_op_wrap(__copy_from_user(d,s,l),e) static inline unsigned long user_access_save(void) { return 0UL; } static inline void user_access_restore(unsigned long flags) { } #endif #ifndef user_write_access_begin #define user_write_access_begin user_access_begin #define user_write_access_end user_access_end #endif #ifndef user_read_access_begin #define user_read_access_begin user_access_begin #define user_read_access_end user_access_end #endif #ifdef CONFIG_HARDENED_USERCOPY void __noreturn usercopy_abort(const char *name, const char *detail, bool to_user, unsigned long offset, unsigned long len); #endif #endif /* __LINUX_UACCESS_H__ */
63 67 51 54 53 54 54 54 54 53 54 1 1 1 53 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/filesystems.c * * Copyright (C) 1991, 1992 Linus Torvalds * * table of configured filesystems */ #include <linux/syscalls.h> #include <linux/fs.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/kmod.h> #include <linux/init.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/fs_parser.h> /* * Handling of filesystem drivers list. * Rules: * Inclusion to/removals from/scanning of list are protected by spinlock. * During the unload module must call unregister_filesystem(). * We can access the fields of list element if: * 1) spinlock is held or * 2) we hold the reference to the module. * The latter can be guaranteed by call of try_module_get(); if it * returned 0 we must skip the element, otherwise we got the reference. * Once the reference is obtained we can drop the spinlock. */ static struct file_system_type *file_systems; static DEFINE_RWLOCK(file_systems_lock); /* WARNING: This can be used only if we _already_ own a reference */ struct file_system_type *get_filesystem(struct file_system_type *fs) { __module_get(fs->owner); return fs; } void put_filesystem(struct file_system_type *fs) { module_put(fs->owner); } static struct file_system_type **find_filesystem(const char *name, unsigned len) { struct file_system_type **p; for (p = &file_systems; *p; p = &(*p)->next) if (strncmp((*p)->name, name, len) == 0 && !(*p)->name[len]) break; return p; } /** * register_filesystem - register a new filesystem * @fs: the file system structure * * Adds the file system passed to the list of file systems the kernel * is aware of for mount and other syscalls. Returns 0 on success, * or a negative errno code on an error. * * The &struct file_system_type that is passed is linked into the kernel * structures and must not be freed until the file system has been * unregistered. */ int register_filesystem(struct file_system_type * fs) { int res = 0; struct file_system_type ** p; if (fs->parameters && !fs_validate_description(fs->name, fs->parameters)) return -EINVAL; BUG_ON(strchr(fs->name, '.')); if (fs->next) return -EBUSY; write_lock(&file_systems_lock); p = find_filesystem(fs->name, strlen(fs->name)); if (*p) res = -EBUSY; else *p = fs; write_unlock(&file_systems_lock); return res; } EXPORT_SYMBOL(register_filesystem); /** * unregister_filesystem - unregister a file system * @fs: filesystem to unregister * * Remove a file system that was previously successfully registered * with the kernel. An error is returned if the file system is not found. * Zero is returned on a success. * * Once this function has returned the &struct file_system_type structure * may be freed or reused. */ int unregister_filesystem(struct file_system_type * fs) { struct file_system_type ** tmp; write_lock(&file_systems_lock); tmp = &file_systems; while (*tmp) { if (fs == *tmp) { *tmp = fs->next; fs->next = NULL; write_unlock(&file_systems_lock); synchronize_rcu(); return 0; } tmp = &(*tmp)->next; } write_unlock(&file_systems_lock); return -EINVAL; } EXPORT_SYMBOL(unregister_filesystem); #ifdef CONFIG_SYSFS_SYSCALL static int fs_index(const char __user * __name) { struct file_system_type * tmp; struct filename *name; int err, index; name = getname(__name); err = PTR_ERR(name); if (IS_ERR(name)) return err; err = -EINVAL; read_lock(&file_systems_lock); for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) { if (strcmp(tmp->name, name->name) == 0) { err = index; break; } } read_unlock(&file_systems_lock); putname(name); return err; } static int fs_name(unsigned int index, char __user * buf) { struct file_system_type * tmp; int len, res; read_lock(&file_systems_lock); for (tmp = file_systems; tmp; tmp = tmp->next, index--) if (index <= 0 && try_module_get(tmp->owner)) break; read_unlock(&file_systems_lock); if (!tmp) return -EINVAL; /* OK, we got the reference, so we can safely block */ len = strlen(tmp->name) + 1; res = copy_to_user(buf, tmp->name, len) ? -EFAULT : 0; put_filesystem(tmp); return res; } static int fs_maxindex(void) { struct file_system_type * tmp; int index; read_lock(&file_systems_lock); for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next, index++) ; read_unlock(&file_systems_lock); return index; } /* * Whee.. Weird sysv syscall. */ SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2) { int retval = -EINVAL; switch (option) { case 1: retval = fs_index((const char __user *) arg1); break; case 2: retval = fs_name(arg1, (char __user *) arg2); break; case 3: retval = fs_maxindex(); break; } return retval; } #endif int __init list_bdev_fs_names(char *buf, size_t size) { struct file_system_type *p; size_t len; int count = 0; read_lock(&file_systems_lock); for (p = file_systems; p; p = p->next) { if (!(p->fs_flags & FS_REQUIRES_DEV)) continue; len = strlen(p->name) + 1; if (len > size) { pr_warn("%s: truncating file system list\n", __func__); break; } memcpy(buf, p->name, len); buf += len; size -= len; count++; } read_unlock(&file_systems_lock); return count; } #ifdef CONFIG_PROC_FS static int filesystems_proc_show(struct seq_file *m, void *v) { struct file_system_type * tmp; read_lock(&file_systems_lock); tmp = file_systems; while (tmp) { seq_printf(m, "%s\t%s\n", (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev", tmp->name); tmp = tmp->next; } read_unlock(&file_systems_lock); return 0; } static int __init proc_filesystems_init(void) { proc_create_single("filesystems", 0, NULL, filesystems_proc_show); return 0; } module_init(proc_filesystems_init); #endif static struct file_system_type *__get_fs_type(const char *name, int len) { struct file_system_type *fs; read_lock(&file_systems_lock); fs = *(find_filesystem(name, len)); if (fs && !try_module_get(fs->owner)) fs = NULL; read_unlock(&file_systems_lock); return fs; } struct file_system_type *get_fs_type(const char *name) { struct file_system_type *fs; const char *dot = strchr(name, '.'); int len = dot ? dot - name : strlen(name); fs = __get_fs_type(name, len); if (!fs && (request_module("fs-%.*s", len, name) == 0)) { fs = __get_fs_type(name, len); if (!fs) pr_warn_once("request_module fs-%.*s succeeded, but still no fs?\n", len, name); } if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) { put_filesystem(fs); fs = NULL; } return fs; } EXPORT_SYMBOL(get_fs_type);
48 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 // SPDX-License-Identifier: GPL-2.0 // Generated by scripts/atomic/gen-atomic-long.sh // DO NOT MODIFY THIS FILE DIRECTLY #ifndef _LINUX_ATOMIC_LONG_H #define _LINUX_ATOMIC_LONG_H #include <linux/compiler.h> #include <asm/types.h> #ifdef CONFIG_64BIT typedef atomic64_t atomic_long_t; #define ATOMIC_LONG_INIT(i) ATOMIC64_INIT(i) #define atomic_long_cond_read_acquire atomic64_cond_read_acquire #define atomic_long_cond_read_relaxed atomic64_cond_read_relaxed #else typedef atomic_t atomic_long_t; #define ATOMIC_LONG_INIT(i) ATOMIC_INIT(i) #define atomic_long_cond_read_acquire atomic_cond_read_acquire #define atomic_long_cond_read_relaxed atomic_cond_read_relaxed #endif /** * raw_atomic_long_read() - atomic load with relaxed ordering * @v: pointer to atomic_long_t * * Atomically loads the value of @v with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_read() elsewhere. * * Return: The value loaded from @v. */ static __always_inline long raw_atomic_long_read(const atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_read(v); #else return raw_atomic_read(v); #endif } /** * raw_atomic_long_read_acquire() - atomic load with acquire ordering * @v: pointer to atomic_long_t * * Atomically loads the value of @v with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_read_acquire() elsewhere. * * Return: The value loaded from @v. */ static __always_inline long raw_atomic_long_read_acquire(const atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_read_acquire(v); #else return raw_atomic_read_acquire(v); #endif } /** * raw_atomic_long_set() - atomic set with relaxed ordering * @v: pointer to atomic_long_t * @i: long value to assign * * Atomically sets @v to @i with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_set() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_set(atomic_long_t *v, long i) { #ifdef CONFIG_64BIT raw_atomic64_set(v, i); #else raw_atomic_set(v, i); #endif } /** * raw_atomic_long_set_release() - atomic set with release ordering * @v: pointer to atomic_long_t * @i: long value to assign * * Atomically sets @v to @i with release ordering. * * Safe to use in noinstr code; prefer atomic_long_set_release() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_set_release(atomic_long_t *v, long i) { #ifdef CONFIG_64BIT raw_atomic64_set_release(v, i); #else raw_atomic_set_release(v, i); #endif } /** * raw_atomic_long_add() - atomic add with relaxed ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_add() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_add(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_add(i, v); #else raw_atomic_add(i, v); #endif } /** * raw_atomic_long_add_return() - atomic add with full ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_add_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_add_return(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_return(i, v); #else return raw_atomic_add_return(i, v); #endif } /** * raw_atomic_long_add_return_acquire() - atomic add with acquire ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_add_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_add_return_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_return_acquire(i, v); #else return raw_atomic_add_return_acquire(i, v); #endif } /** * raw_atomic_long_add_return_release() - atomic add with release ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_add_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_add_return_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_return_release(i, v); #else return raw_atomic_add_return_release(i, v); #endif } /** * raw_atomic_long_add_return_relaxed() - atomic add with relaxed ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_add_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_add_return_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_return_relaxed(i, v); #else return raw_atomic_add_return_relaxed(i, v); #endif } /** * raw_atomic_long_fetch_add() - atomic add with full ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_add() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_add(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_add(i, v); #else return raw_atomic_fetch_add(i, v); #endif } /** * raw_atomic_long_fetch_add_acquire() - atomic add with acquire ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_add_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_add_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_add_acquire(i, v); #else return raw_atomic_fetch_add_acquire(i, v); #endif } /** * raw_atomic_long_fetch_add_release() - atomic add with release ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_add_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_add_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_add_release(i, v); #else return raw_atomic_fetch_add_release(i, v); #endif } /** * raw_atomic_long_fetch_add_relaxed() - atomic add with relaxed ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_add_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_add_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_add_relaxed(i, v); #else return raw_atomic_fetch_add_relaxed(i, v); #endif } /** * raw_atomic_long_sub() - atomic subtract with relaxed ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_sub() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_sub(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_sub(i, v); #else raw_atomic_sub(i, v); #endif } /** * raw_atomic_long_sub_return() - atomic subtract with full ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_sub_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_sub_return(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_sub_return(i, v); #else return raw_atomic_sub_return(i, v); #endif } /** * raw_atomic_long_sub_return_acquire() - atomic subtract with acquire ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_sub_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_sub_return_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_sub_return_acquire(i, v); #else return raw_atomic_sub_return_acquire(i, v); #endif } /** * raw_atomic_long_sub_return_release() - atomic subtract with release ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_sub_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_sub_return_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_sub_return_release(i, v); #else return raw_atomic_sub_return_release(i, v); #endif } /** * raw_atomic_long_sub_return_relaxed() - atomic subtract with relaxed ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_sub_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_sub_return_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_sub_return_relaxed(i, v); #else return raw_atomic_sub_return_relaxed(i, v); #endif } /** * raw_atomic_long_fetch_sub() - atomic subtract with full ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_sub() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_sub(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_sub(i, v); #else return raw_atomic_fetch_sub(i, v); #endif } /** * raw_atomic_long_fetch_sub_acquire() - atomic subtract with acquire ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_sub_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_sub_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_sub_acquire(i, v); #else return raw_atomic_fetch_sub_acquire(i, v); #endif } /** * raw_atomic_long_fetch_sub_release() - atomic subtract with release ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_sub_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_sub_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_sub_release(i, v); #else return raw_atomic_fetch_sub_release(i, v); #endif } /** * raw_atomic_long_fetch_sub_relaxed() - atomic subtract with relaxed ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_sub_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_sub_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_sub_relaxed(i, v); #else return raw_atomic_fetch_sub_relaxed(i, v); #endif } /** * raw_atomic_long_inc() - atomic increment with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_inc() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_inc(atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_inc(v); #else raw_atomic_inc(v); #endif } /** * raw_atomic_long_inc_return() - atomic increment with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_inc_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_inc_return(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_inc_return(v); #else return raw_atomic_inc_return(v); #endif } /** * raw_atomic_long_inc_return_acquire() - atomic increment with acquire ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_inc_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_inc_return_acquire(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_inc_return_acquire(v); #else return raw_atomic_inc_return_acquire(v); #endif } /** * raw_atomic_long_inc_return_release() - atomic increment with release ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_inc_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_inc_return_release(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_inc_return_release(v); #else return raw_atomic_inc_return_release(v); #endif } /** * raw_atomic_long_inc_return_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_inc_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_inc_return_relaxed(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_inc_return_relaxed(v); #else return raw_atomic_inc_return_relaxed(v); #endif } /** * raw_atomic_long_fetch_inc() - atomic increment with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_inc() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_inc(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_inc(v); #else return raw_atomic_fetch_inc(v); #endif } /** * raw_atomic_long_fetch_inc_acquire() - atomic increment with acquire ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_inc_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_inc_acquire(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_inc_acquire(v); #else return raw_atomic_fetch_inc_acquire(v); #endif } /** * raw_atomic_long_fetch_inc_release() - atomic increment with release ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_inc_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_inc_release(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_inc_release(v); #else return raw_atomic_fetch_inc_release(v); #endif } /** * raw_atomic_long_fetch_inc_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_inc_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_inc_relaxed(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_inc_relaxed(v); #else return raw_atomic_fetch_inc_relaxed(v); #endif } /** * raw_atomic_long_dec() - atomic decrement with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_dec() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_dec(atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_dec(v); #else raw_atomic_dec(v); #endif } /** * raw_atomic_long_dec_return() - atomic decrement with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_dec_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_dec_return(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_dec_return(v); #else return raw_atomic_dec_return(v); #endif } /** * raw_atomic_long_dec_return_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_dec_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_dec_return_acquire(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_dec_return_acquire(v); #else return raw_atomic_dec_return_acquire(v); #endif } /** * raw_atomic_long_dec_return_release() - atomic decrement with release ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_dec_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_dec_return_release(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_dec_return_release(v); #else return raw_atomic_dec_return_release(v); #endif } /** * raw_atomic_long_dec_return_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_dec_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_dec_return_relaxed(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_dec_return_relaxed(v); #else return raw_atomic_dec_return_relaxed(v); #endif } /** * raw_atomic_long_fetch_dec() - atomic decrement with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_dec() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_dec(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_dec(v); #else return raw_atomic_fetch_dec(v); #endif } /** * raw_atomic_long_fetch_dec_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_dec_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_dec_acquire(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_dec_acquire(v); #else return raw_atomic_fetch_dec_acquire(v); #endif } /** * raw_atomic_long_fetch_dec_release() - atomic decrement with release ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_dec_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_dec_release(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_dec_release(v); #else return raw_atomic_fetch_dec_release(v); #endif } /** * raw_atomic_long_fetch_dec_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_dec_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_dec_relaxed(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_dec_relaxed(v); #else return raw_atomic_fetch_dec_relaxed(v); #endif } /** * raw_atomic_long_and() - atomic bitwise AND with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_and() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_and(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_and(i, v); #else raw_atomic_and(i, v); #endif } /** * raw_atomic_long_fetch_and() - atomic bitwise AND with full ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_and() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_and(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_and(i, v); #else return raw_atomic_fetch_and(i, v); #endif } /** * raw_atomic_long_fetch_and_acquire() - atomic bitwise AND with acquire ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_and_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_and_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_and_acquire(i, v); #else return raw_atomic_fetch_and_acquire(i, v); #endif } /** * raw_atomic_long_fetch_and_release() - atomic bitwise AND with release ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_and_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_and_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_and_release(i, v); #else return raw_atomic_fetch_and_release(i, v); #endif } /** * raw_atomic_long_fetch_and_relaxed() - atomic bitwise AND with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_and_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_and_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_and_relaxed(i, v); #else return raw_atomic_fetch_and_relaxed(i, v); #endif } /** * raw_atomic_long_andnot() - atomic bitwise AND NOT with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_andnot() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_andnot(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_andnot(i, v); #else raw_atomic_andnot(i, v); #endif } /** * raw_atomic_long_fetch_andnot() - atomic bitwise AND NOT with full ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_andnot() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_andnot(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_andnot(i, v); #else return raw_atomic_fetch_andnot(i, v); #endif } /** * raw_atomic_long_fetch_andnot_acquire() - atomic bitwise AND NOT with acquire ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_andnot_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_andnot_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_andnot_acquire(i, v); #else return raw_atomic_fetch_andnot_acquire(i, v); #endif } /** * raw_atomic_long_fetch_andnot_release() - atomic bitwise AND NOT with release ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_andnot_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_andnot_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_andnot_release(i, v); #else return raw_atomic_fetch_andnot_release(i, v); #endif } /** * raw_atomic_long_fetch_andnot_relaxed() - atomic bitwise AND NOT with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_andnot_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_andnot_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_andnot_relaxed(i, v); #else return raw_atomic_fetch_andnot_relaxed(i, v); #endif } /** * raw_atomic_long_or() - atomic bitwise OR with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_or() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_or(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_or(i, v); #else raw_atomic_or(i, v); #endif } /** * raw_atomic_long_fetch_or() - atomic bitwise OR with full ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_or() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_or(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_or(i, v); #else return raw_atomic_fetch_or(i, v); #endif } /** * raw_atomic_long_fetch_or_acquire() - atomic bitwise OR with acquire ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_or_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_or_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_or_acquire(i, v); #else return raw_atomic_fetch_or_acquire(i, v); #endif } /** * raw_atomic_long_fetch_or_release() - atomic bitwise OR with release ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_or_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_or_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_or_release(i, v); #else return raw_atomic_fetch_or_release(i, v); #endif } /** * raw_atomic_long_fetch_or_relaxed() - atomic bitwise OR with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_or_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_or_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_or_relaxed(i, v); #else return raw_atomic_fetch_or_relaxed(i, v); #endif } /** * raw_atomic_long_xor() - atomic bitwise XOR with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_xor() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_xor(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_xor(i, v); #else raw_atomic_xor(i, v); #endif } /** * raw_atomic_long_fetch_xor() - atomic bitwise XOR with full ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_xor() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_xor(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_xor(i, v); #else return raw_atomic_fetch_xor(i, v); #endif } /** * raw_atomic_long_fetch_xor_acquire() - atomic bitwise XOR with acquire ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_xor_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_xor_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_xor_acquire(i, v); #else return raw_atomic_fetch_xor_acquire(i, v); #endif } /** * raw_atomic_long_fetch_xor_release() - atomic bitwise XOR with release ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_xor_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_xor_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_xor_release(i, v); #else return raw_atomic_fetch_xor_release(i, v); #endif } /** * raw_atomic_long_fetch_xor_relaxed() - atomic bitwise XOR with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_xor_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_xor_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_xor_relaxed(i, v); #else return raw_atomic_fetch_xor_relaxed(i, v); #endif } /** * raw_atomic_long_xchg() - atomic exchange with full ordering * @v: pointer to atomic_long_t * @new: long value to assign * * Atomically updates @v to @new with full ordering. * * Safe to use in noinstr code; prefer atomic_long_xchg() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_xchg(atomic_long_t *v, long new) { #ifdef CONFIG_64BIT return raw_atomic64_xchg(v, new); #else return raw_atomic_xchg(v, new); #endif } /** * raw_atomic_long_xchg_acquire() - atomic exchange with acquire ordering * @v: pointer to atomic_long_t * @new: long value to assign * * Atomically updates @v to @new with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_xchg_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_xchg_acquire(atomic_long_t *v, long new) { #ifdef CONFIG_64BIT return raw_atomic64_xchg_acquire(v, new); #else return raw_atomic_xchg_acquire(v, new); #endif } /** * raw_atomic_long_xchg_release() - atomic exchange with release ordering * @v: pointer to atomic_long_t * @new: long value to assign * * Atomically updates @v to @new with release ordering. * * Safe to use in noinstr code; prefer atomic_long_xchg_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_xchg_release(atomic_long_t *v, long new) { #ifdef CONFIG_64BIT return raw_atomic64_xchg_release(v, new); #else return raw_atomic_xchg_release(v, new); #endif } /** * raw_atomic_long_xchg_relaxed() - atomic exchange with relaxed ordering * @v: pointer to atomic_long_t * @new: long value to assign * * Atomically updates @v to @new with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_xchg_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_xchg_relaxed(atomic_long_t *v, long new) { #ifdef CONFIG_64BIT return raw_atomic64_xchg_relaxed(v, new); #else return raw_atomic_xchg_relaxed(v, new); #endif } /** * raw_atomic_long_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic_long_t * @old: long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_cmpxchg() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_cmpxchg(atomic_long_t *v, long old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_cmpxchg(v, old, new); #else return raw_atomic_cmpxchg(v, old, new); #endif } /** * raw_atomic_long_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic_long_t * @old: long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_cmpxchg_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_cmpxchg_acquire(atomic_long_t *v, long old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_cmpxchg_acquire(v, old, new); #else return raw_atomic_cmpxchg_acquire(v, old, new); #endif } /** * raw_atomic_long_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic_long_t * @old: long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_cmpxchg_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_cmpxchg_release(atomic_long_t *v, long old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_cmpxchg_release(v, old, new); #else return raw_atomic_cmpxchg_release(v, old, new); #endif } /** * raw_atomic_long_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic_long_t * @old: long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_cmpxchg_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_cmpxchg_relaxed(atomic_long_t *v, long old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_cmpxchg_relaxed(v, old, new); #else return raw_atomic_cmpxchg_relaxed(v, old, new); #endif } /** * raw_atomic_long_try_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic_long_t * @old: pointer to long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_try_cmpxchg() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic_long_try_cmpxchg(atomic_long_t *v, long *old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_try_cmpxchg(v, (s64 *)old, new); #else return raw_atomic_try_cmpxchg(v, (int *)old, new); #endif } /** * raw_atomic_long_try_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic_long_t * @old: pointer to long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_try_cmpxchg_acquire() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic_long_try_cmpxchg_acquire(atomic_long_t *v, long *old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_try_cmpxchg_acquire(v, (s64 *)old, new); #else return raw_atomic_try_cmpxchg_acquire(v, (int *)old, new); #endif } /** * raw_atomic_long_try_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic_long_t * @old: pointer to long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_try_cmpxchg_release() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic_long_try_cmpxchg_release(atomic_long_t *v, long *old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_try_cmpxchg_release(v, (s64 *)old, new); #else return raw_atomic_try_cmpxchg_release(v, (int *)old, new); #endif } /** * raw_atomic_long_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic_long_t * @old: pointer to long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_try_cmpxchg_relaxed() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic_long_try_cmpxchg_relaxed(atomic_long_t *v, long *old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_try_cmpxchg_relaxed(v, (s64 *)old, new); #else return raw_atomic_try_cmpxchg_relaxed(v, (int *)old, new); #endif } /** * raw_atomic_long_sub_and_test() - atomic subtract and test if zero with full ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_sub_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic_long_sub_and_test(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_sub_and_test(i, v); #else return raw_atomic_sub_and_test(i, v); #endif } /** * raw_atomic_long_dec_and_test() - atomic decrement and test if zero with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_dec_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic_long_dec_and_test(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_dec_and_test(v); #else return raw_atomic_dec_and_test(v); #endif } /** * raw_atomic_long_inc_and_test() - atomic increment and test if zero with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_inc_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic_long_inc_and_test(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_inc_and_test(v); #else return raw_atomic_inc_and_test(v); #endif } /** * raw_atomic_long_add_negative() - atomic add and test if negative with full ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_add_negative() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic_long_add_negative(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_negative(i, v); #else return raw_atomic_add_negative(i, v); #endif } /** * raw_atomic_long_add_negative_acquire() - atomic add and test if negative with acquire ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_add_negative_acquire() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic_long_add_negative_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_negative_acquire(i, v); #else return raw_atomic_add_negative_acquire(i, v); #endif } /** * raw_atomic_long_add_negative_release() - atomic add and test if negative with release ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_add_negative_release() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic_long_add_negative_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_negative_release(i, v); #else return raw_atomic_add_negative_release(i, v); #endif } /** * raw_atomic_long_add_negative_relaxed() - atomic add and test if negative with relaxed ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_add_negative_relaxed() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic_long_add_negative_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_negative_relaxed(i, v); #else return raw_atomic_add_negative_relaxed(i, v); #endif } /** * raw_atomic_long_fetch_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic_long_t * @a: long value to add * @u: long value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_fetch_add_unless() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_add_unless(v, a, u); #else return raw_atomic_fetch_add_unless(v, a, u); #endif } /** * raw_atomic_long_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic_long_t * @a: long value to add * @u: long value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_add_unless() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic_long_add_unless(atomic_long_t *v, long a, long u) { #ifdef CONFIG_64BIT return raw_atomic64_add_unless(v, a, u); #else return raw_atomic_add_unless(v, a, u); #endif } /** * raw_atomic_long_inc_not_zero() - atomic increment unless zero with full ordering * @v: pointer to atomic_long_t * * If (@v != 0), atomically updates @v to (@v + 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_inc_not_zero() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic_long_inc_not_zero(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_inc_not_zero(v); #else return raw_atomic_inc_not_zero(v); #endif } /** * raw_atomic_long_inc_unless_negative() - atomic increment unless negative with full ordering * @v: pointer to atomic_long_t * * If (@v >= 0), atomically updates @v to (@v + 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_inc_unless_negative() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic_long_inc_unless_negative(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_inc_unless_negative(v); #else return raw_atomic_inc_unless_negative(v); #endif } /** * raw_atomic_long_dec_unless_positive() - atomic decrement unless positive with full ordering * @v: pointer to atomic_long_t * * If (@v <= 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_dec_unless_positive() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic_long_dec_unless_positive(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_dec_unless_positive(v); #else return raw_atomic_dec_unless_positive(v); #endif } /** * raw_atomic_long_dec_if_positive() - atomic decrement if positive with full ordering * @v: pointer to atomic_long_t * * If (@v > 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_dec_if_positive() elsewhere. * * Return: The old value of (@v - 1), regardless of whether @v was updated. */ static __always_inline long raw_atomic_long_dec_if_positive(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_dec_if_positive(v); #else return raw_atomic_dec_if_positive(v); #endif } #endif /* _LINUX_ATOMIC_LONG_H */ // eadf183c3600b8b92b91839dd3be6bcc560c752d
4 4 3 3 3 1 2 1 3 4 4 4 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 // SPDX-License-Identifier: GPL-2.0 #include <linux/compat.h> #include <linux/console.h> #include <linux/fb.h> #include <linux/fbcon.h> #include <linux/major.h> #include "fb_internal.h" /* * We hold a reference to the fb_info in file->private_data, * but if the current registered fb has changed, we don't * actually want to use it. * * So look up the fb_info using the inode minor number, * and just verify it against the reference we have. */ static struct fb_info *file_fb_info(struct file *file) { struct inode *inode = file_inode(file); int fbidx = iminor(inode); struct fb_info *info = registered_fb[fbidx]; if (info != file->private_data) info = NULL; return info; } static ssize_t fb_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct fb_info *info = file_fb_info(file); if (!info) return -ENODEV; if (fb_WARN_ON_ONCE(info, !info->fbops->fb_read)) return -EINVAL; if (info->state != FBINFO_STATE_RUNNING) return -EPERM; return info->fbops->fb_read(info, buf, count, ppos); } static ssize_t fb_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct fb_info *info = file_fb_info(file); if (!info) return -ENODEV; if (fb_WARN_ON_ONCE(info, !info->fbops->fb_write)) return -EINVAL; if (info->state != FBINFO_STATE_RUNNING) return -EPERM; return info->fbops->fb_write(info, buf, count, ppos); } static long do_fb_ioctl(struct fb_info *info, unsigned int cmd, unsigned long arg) { const struct fb_ops *fb; struct fb_var_screeninfo var; struct fb_fix_screeninfo fix; struct fb_cmap cmap_from; struct fb_cmap_user cmap; void __user *argp = (void __user *)arg; long ret = 0; switch (cmd) { case FBIOGET_VSCREENINFO: lock_fb_info(info); var = info->var; unlock_fb_info(info); ret = copy_to_user(argp, &var, sizeof(var)) ? -EFAULT : 0; break; case FBIOPUT_VSCREENINFO: if (copy_from_user(&var, argp, sizeof(var))) return -EFAULT; /* only for kernel-internal use */ var.activate &= ~FB_ACTIVATE_KD_TEXT; console_lock(); lock_fb_info(info); ret = fbcon_modechange_possible(info, &var); if (!ret) ret = fb_set_var(info, &var); if (!ret) fbcon_update_vcs(info, var.activate & FB_ACTIVATE_ALL); unlock_fb_info(info); console_unlock(); if (!ret && copy_to_user(argp, &var, sizeof(var))) ret = -EFAULT; break; case FBIOGET_FSCREENINFO: lock_fb_info(info); memcpy(&fix, &info->fix, sizeof(fix)); if (info->flags & FBINFO_HIDE_SMEM_START) fix.smem_start = 0; unlock_fb_info(info); ret = copy_to_user(argp, &fix, sizeof(fix)) ? -EFAULT : 0; break; case FBIOPUTCMAP: if (copy_from_user(&cmap, argp, sizeof(cmap))) return -EFAULT; ret = fb_set_user_cmap(&cmap, info); break; case FBIOGETCMAP: if (copy_from_user(&cmap, argp, sizeof(cmap))) return -EFAULT; lock_fb_info(info); cmap_from = info->cmap; unlock_fb_info(info); ret = fb_cmap_to_user(&cmap_from, &cmap); break; case FBIOPAN_DISPLAY: if (copy_from_user(&var, argp, sizeof(var))) return -EFAULT; console_lock(); lock_fb_info(info); ret = fb_pan_display(info, &var); unlock_fb_info(info); console_unlock(); if (ret == 0 && copy_to_user(argp, &var, sizeof(var))) return -EFAULT; break; case FBIO_CURSOR: ret = -EINVAL; break; case FBIOGET_CON2FBMAP: ret = fbcon_get_con2fb_map_ioctl(argp); break; case FBIOPUT_CON2FBMAP: ret = fbcon_set_con2fb_map_ioctl(argp); break; case FBIOBLANK: if (arg > FB_BLANK_POWERDOWN) return -EINVAL; console_lock(); lock_fb_info(info); ret = fb_blank(info, arg); /* might again call into fb_blank */ fbcon_fb_blanked(info, arg); unlock_fb_info(info); console_unlock(); break; default: lock_fb_info(info); fb = info->fbops; if (fb->fb_ioctl) ret = fb->fb_ioctl(info, cmd, arg); else ret = -ENOTTY; unlock_fb_info(info); } return ret; } static long fb_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct fb_info *info = file_fb_info(file); if (!info) return -ENODEV; return do_fb_ioctl(info, cmd, arg); } #ifdef CONFIG_COMPAT struct fb_fix_screeninfo32 { char id[16]; compat_caddr_t smem_start; u32 smem_len; u32 type; u32 type_aux; u32 visual; u16 xpanstep; u16 ypanstep; u16 ywrapstep; u32 line_length; compat_caddr_t mmio_start; u32 mmio_len; u32 accel; u16 reserved[3]; }; struct fb_cmap32 { u32 start; u32 len; compat_caddr_t red; compat_caddr_t green; compat_caddr_t blue; compat_caddr_t transp; }; static int fb_getput_cmap(struct fb_info *info, unsigned int cmd, unsigned long arg) { struct fb_cmap32 cmap32; struct fb_cmap cmap_from; struct fb_cmap_user cmap; if (copy_from_user(&cmap32, compat_ptr(arg), sizeof(cmap32))) return -EFAULT; cmap = (struct fb_cmap_user) { .start = cmap32.start, .len = cmap32.len, .red = compat_ptr(cmap32.red), .green = compat_ptr(cmap32.green), .blue = compat_ptr(cmap32.blue), .transp = compat_ptr(cmap32.transp), }; if (cmd == FBIOPUTCMAP) return fb_set_user_cmap(&cmap, info); lock_fb_info(info); cmap_from = info->cmap; unlock_fb_info(info); return fb_cmap_to_user(&cmap_from, &cmap); } static int do_fscreeninfo_to_user(struct fb_fix_screeninfo *fix, struct fb_fix_screeninfo32 __user *fix32) { __u32 data; int err; err = copy_to_user(&fix32->id, &fix->id, sizeof(fix32->id)); data = (__u32) (unsigned long) fix->smem_start; err |= put_user(data, &fix32->smem_start); err |= put_user(fix->smem_len, &fix32->smem_len); err |= put_user(fix->type, &fix32->type); err |= put_user(fix->type_aux, &fix32->type_aux); err |= put_user(fix->visual, &fix32->visual); err |= put_user(fix->xpanstep, &fix32->xpanstep); err |= put_user(fix->ypanstep, &fix32->ypanstep); err |= put_user(fix->ywrapstep, &fix32->ywrapstep); err |= put_user(fix->line_length, &fix32->line_length); data = (__u32) (unsigned long) fix->mmio_start; err |= put_user(data, &fix32->mmio_start); err |= put_user(fix->mmio_len, &fix32->mmio_len); err |= put_user(fix->accel, &fix32->accel); err |= copy_to_user(fix32->reserved, fix->reserved, sizeof(fix->reserved)); if (err) return -EFAULT; return 0; } static int fb_get_fscreeninfo(struct fb_info *info, unsigned int cmd, unsigned long arg) { struct fb_fix_screeninfo fix; lock_fb_info(info); fix = info->fix; if (info->flags & FBINFO_HIDE_SMEM_START) fix.smem_start = 0; unlock_fb_info(info); return do_fscreeninfo_to_user(&fix, compat_ptr(arg)); } static long fb_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct fb_info *info = file_fb_info(file); const struct fb_ops *fb; long ret = -ENOIOCTLCMD; if (!info) return -ENODEV; fb = info->fbops; switch (cmd) { case FBIOGET_VSCREENINFO: case FBIOPUT_VSCREENINFO: case FBIOPAN_DISPLAY: case FBIOGET_CON2FBMAP: case FBIOPUT_CON2FBMAP: arg = (unsigned long) compat_ptr(arg); fallthrough; case FBIOBLANK: ret = do_fb_ioctl(info, cmd, arg); break; case FBIOGET_FSCREENINFO: ret = fb_get_fscreeninfo(info, cmd, arg); break; case FBIOGETCMAP: case FBIOPUTCMAP: ret = fb_getput_cmap(info, cmd, arg); break; default: if (fb->fb_compat_ioctl) ret = fb->fb_compat_ioctl(info, cmd, arg); break; } return ret; } #endif static int fb_mmap(struct file *file, struct vm_area_struct *vma) { struct fb_info *info = file_fb_info(file); int res; if (!info) return -ENODEV; if (fb_WARN_ON_ONCE(info, !info->fbops->fb_mmap)) return -ENODEV; mutex_lock(&info->mm_lock); res = info->fbops->fb_mmap(info, vma); mutex_unlock(&info->mm_lock); return res; } static int fb_open(struct inode *inode, struct file *file) __acquires(&info->lock) __releases(&info->lock) { int fbidx = iminor(inode); struct fb_info *info; int res = 0; info = get_fb_info(fbidx); if (!info) { request_module("fb%d", fbidx); info = get_fb_info(fbidx); if (!info) return -ENODEV; } if (IS_ERR(info)) return PTR_ERR(info); lock_fb_info(info); if (!try_module_get(info->fbops->owner)) { res = -ENODEV; goto out; } file->private_data = info; if (info->fbops->fb_open) { res = info->fbops->fb_open(info, 1); if (res) module_put(info->fbops->owner); } #ifdef CONFIG_FB_DEFERRED_IO if (info->fbdefio) fb_deferred_io_open(info, inode, file); #endif out: unlock_fb_info(info); if (res) put_fb_info(info); return res; } static int fb_release(struct inode *inode, struct file *file) __acquires(&info->lock) __releases(&info->lock) { struct fb_info * const info = file->private_data; lock_fb_info(info); #if IS_ENABLED(CONFIG_FB_DEFERRED_IO) if (info->fbdefio) fb_deferred_io_release(info); #endif if (info->fbops->fb_release) info->fbops->fb_release(info, 1); module_put(info->fbops->owner); unlock_fb_info(info); put_fb_info(info); return 0; } #if defined(CONFIG_FB_PROVIDE_GET_FB_UNMAPPED_AREA) && !defined(CONFIG_MMU) static unsigned long get_fb_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct fb_info * const info = filp->private_data; unsigned long fb_size = PAGE_ALIGN(info->fix.smem_len); if (pgoff > fb_size || len > fb_size - pgoff) return -EINVAL; return (unsigned long)info->screen_base + pgoff; } #endif static const struct file_operations fb_fops = { .owner = THIS_MODULE, .read = fb_read, .write = fb_write, .unlocked_ioctl = fb_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = fb_compat_ioctl, #endif .mmap = fb_mmap, .open = fb_open, .release = fb_release, #if defined(HAVE_ARCH_FB_UNMAPPED_AREA) || \ (defined(CONFIG_FB_PROVIDE_GET_FB_UNMAPPED_AREA) && \ !defined(CONFIG_MMU)) .get_unmapped_area = get_fb_unmapped_area, #endif #ifdef CONFIG_FB_DEFERRED_IO .fsync = fb_deferred_io_fsync, #endif .llseek = default_llseek, }; int fb_register_chrdev(void) { int ret; ret = register_chrdev(FB_MAJOR, "fb", &fb_fops); if (ret) { pr_err("Unable to get major %d for fb devs\n", FB_MAJOR); return ret; } return ret; } void fb_unregister_chrdev(void) { unregister_chrdev(FB_MAJOR, "fb"); }
3 4 3 3 1 6 2 2 2 1 1 1 30 1 5 2 2 1 2 3 1 1 4 6 7 17 17 10 17 17 5 5 5 4 7 1 1 3 3 3 3 2 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */ #ifndef _LINUX_SKMSG_H #define _LINUX_SKMSG_H #include <linux/bpf.h> #include <linux/filter.h> #include <linux/scatterlist.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/tcp.h> #include <net/strparser.h> #define MAX_MSG_FRAGS MAX_SKB_FRAGS #define NR_MSG_FRAG_IDS (MAX_MSG_FRAGS + 1) enum __sk_action { __SK_DROP = 0, __SK_PASS, __SK_REDIRECT, __SK_NONE, }; struct sk_msg_sg { u32 start; u32 curr; u32 end; u32 size; u32 copybreak; DECLARE_BITMAP(copy, MAX_MSG_FRAGS + 2); /* The extra two elements: * 1) used for chaining the front and sections when the list becomes * partitioned (e.g. end < start). The crypto APIs require the * chaining; * 2) to chain tailer SG entries after the message. */ struct scatterlist data[MAX_MSG_FRAGS + 2]; }; /* UAPI in filter.c depends on struct sk_msg_sg being first element. */ struct sk_msg { struct sk_msg_sg sg; void *data; void *data_end; u32 apply_bytes; u32 cork_bytes; u32 flags; struct sk_buff *skb; struct sock *sk_redir; struct sock *sk; struct list_head list; }; struct sk_psock_progs { struct bpf_prog *msg_parser; struct bpf_prog *stream_parser; struct bpf_prog *stream_verdict; struct bpf_prog *skb_verdict; struct bpf_link *msg_parser_link; struct bpf_link *stream_parser_link; struct bpf_link *stream_verdict_link; struct bpf_link *skb_verdict_link; }; enum sk_psock_state_bits { SK_PSOCK_TX_ENABLED, SK_PSOCK_RX_STRP_ENABLED, }; struct sk_psock_link { struct list_head list; struct bpf_map *map; void *link_raw; }; struct sk_psock_work_state { u32 len; u32 off; }; struct sk_psock { struct sock *sk; struct sock *sk_redir; u32 apply_bytes; u32 cork_bytes; u32 eval; bool redir_ingress; /* undefined if sk_redir is null */ struct sk_msg *cork; struct sk_psock_progs progs; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) struct strparser strp; #endif struct sk_buff_head ingress_skb; struct list_head ingress_msg; spinlock_t ingress_lock; unsigned long state; struct list_head link; spinlock_t link_lock; refcount_t refcnt; void (*saved_unhash)(struct sock *sk); void (*saved_destroy)(struct sock *sk); void (*saved_close)(struct sock *sk, long timeout); void (*saved_write_space)(struct sock *sk); void (*saved_data_ready)(struct sock *sk); /* psock_update_sk_prot may be called with restore=false many times * so the handler must be safe for this case. It will be called * exactly once with restore=true when the psock is being destroyed * and psock refcnt is zero, but before an RCU grace period. */ int (*psock_update_sk_prot)(struct sock *sk, struct sk_psock *psock, bool restore); struct proto *sk_proto; struct mutex work_mutex; struct sk_psock_work_state work_state; struct delayed_work work; struct sock *sk_pair; struct rcu_work rwork; }; int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len, int elem_first_coalesce); int sk_msg_clone(struct sock *sk, struct sk_msg *dst, struct sk_msg *src, u32 off, u32 len); void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len); int sk_msg_free(struct sock *sk, struct sk_msg *msg); int sk_msg_free_nocharge(struct sock *sk, struct sk_msg *msg); void sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, u32 bytes); void sk_msg_free_partial_nocharge(struct sock *sk, struct sk_msg *msg, u32 bytes); void sk_msg_return(struct sock *sk, struct sk_msg *msg, int bytes); void sk_msg_return_zero(struct sock *sk, struct sk_msg *msg, int bytes); int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes); int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes); int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags); bool sk_msg_is_readable(struct sock *sk); static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes) { WARN_ON(i == msg->sg.end && bytes); } static inline void sk_msg_apply_bytes(struct sk_psock *psock, u32 bytes) { if (psock->apply_bytes) { if (psock->apply_bytes < bytes) psock->apply_bytes = 0; else psock->apply_bytes -= bytes; } } static inline u32 sk_msg_iter_dist(u32 start, u32 end) { return end >= start ? end - start : end + (NR_MSG_FRAG_IDS - start); } #define sk_msg_iter_var_prev(var) \ do { \ if (var == 0) \ var = NR_MSG_FRAG_IDS - 1; \ else \ var--; \ } while (0) #define sk_msg_iter_var_next(var) \ do { \ var++; \ if (var == NR_MSG_FRAG_IDS) \ var = 0; \ } while (0) #define sk_msg_iter_prev(msg, which) \ sk_msg_iter_var_prev(msg->sg.which) #define sk_msg_iter_next(msg, which) \ sk_msg_iter_var_next(msg->sg.which) static inline void sk_msg_init(struct sk_msg *msg) { BUILD_BUG_ON(ARRAY_SIZE(msg->sg.data) - 1 != NR_MSG_FRAG_IDS); memset(msg, 0, sizeof(*msg)); sg_init_marker(msg->sg.data, NR_MSG_FRAG_IDS); } static inline void sk_msg_xfer(struct sk_msg *dst, struct sk_msg *src, int which, u32 size) { dst->sg.data[which] = src->sg.data[which]; dst->sg.data[which].length = size; dst->sg.size += size; src->sg.size -= size; src->sg.data[which].length -= size; src->sg.data[which].offset += size; } static inline void sk_msg_xfer_full(struct sk_msg *dst, struct sk_msg *src) { memcpy(dst, src, sizeof(*src)); sk_msg_init(src); } static inline bool sk_msg_full(const struct sk_msg *msg) { return sk_msg_iter_dist(msg->sg.start, msg->sg.end) == MAX_MSG_FRAGS; } static inline u32 sk_msg_elem_used(const struct sk_msg *msg) { return sk_msg_iter_dist(msg->sg.start, msg->sg.end); } static inline struct scatterlist *sk_msg_elem(struct sk_msg *msg, int which) { return &msg->sg.data[which]; } static inline struct scatterlist sk_msg_elem_cpy(struct sk_msg *msg, int which) { return msg->sg.data[which]; } static inline struct page *sk_msg_page(struct sk_msg *msg, int which) { return sg_page(sk_msg_elem(msg, which)); } static inline bool sk_msg_to_ingress(const struct sk_msg *msg) { return msg->flags & BPF_F_INGRESS; } static inline void sk_msg_compute_data_pointers(struct sk_msg *msg) { struct scatterlist *sge = sk_msg_elem(msg, msg->sg.start); if (test_bit(msg->sg.start, msg->sg.copy)) { msg->data = NULL; msg->data_end = NULL; } else { msg->data = sg_virt(sge); msg->data_end = msg->data + sge->length; } } static inline void sk_msg_page_add(struct sk_msg *msg, struct page *page, u32 len, u32 offset) { struct scatterlist *sge; get_page(page); sge = sk_msg_elem(msg, msg->sg.end); sg_set_page(sge, page, len, offset); sg_unmark_end(sge); __set_bit(msg->sg.end, msg->sg.copy); msg->sg.size += len; sk_msg_iter_next(msg, end); } static inline void sk_msg_sg_copy(struct sk_msg *msg, u32 i, bool copy_state) { do { if (copy_state) __set_bit(i, msg->sg.copy); else __clear_bit(i, msg->sg.copy); sk_msg_iter_var_next(i); if (i == msg->sg.end) break; } while (1); } static inline void sk_msg_sg_copy_set(struct sk_msg *msg, u32 start) { sk_msg_sg_copy(msg, start, true); } static inline void sk_msg_sg_copy_clear(struct sk_msg *msg, u32 start) { sk_msg_sg_copy(msg, start, false); } static inline struct sk_psock *sk_psock(const struct sock *sk) { return __rcu_dereference_sk_user_data_with_flags(sk, SK_USER_DATA_PSOCK); } static inline void sk_psock_set_state(struct sk_psock *psock, enum sk_psock_state_bits bit) { set_bit(bit, &psock->state); } static inline void sk_psock_clear_state(struct sk_psock *psock, enum sk_psock_state_bits bit) { clear_bit(bit, &psock->state); } static inline bool sk_psock_test_state(const struct sk_psock *psock, enum sk_psock_state_bits bit) { return test_bit(bit, &psock->state); } static inline void sock_drop(struct sock *sk, struct sk_buff *skb) { sk_drops_add(sk, skb); kfree_skb(skb); } static inline void sk_psock_queue_msg(struct sk_psock *psock, struct sk_msg *msg) { spin_lock_bh(&psock->ingress_lock); if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) list_add_tail(&msg->list, &psock->ingress_msg); else { sk_msg_free(psock->sk, msg); kfree(msg); } spin_unlock_bh(&psock->ingress_lock); } static inline struct sk_msg *sk_psock_dequeue_msg(struct sk_psock *psock) { struct sk_msg *msg; spin_lock_bh(&psock->ingress_lock); msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); if (msg) list_del(&msg->list); spin_unlock_bh(&psock->ingress_lock); return msg; } static inline struct sk_msg *sk_psock_peek_msg(struct sk_psock *psock) { struct sk_msg *msg; spin_lock_bh(&psock->ingress_lock); msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); spin_unlock_bh(&psock->ingress_lock); return msg; } static inline struct sk_msg *sk_psock_next_msg(struct sk_psock *psock, struct sk_msg *msg) { struct sk_msg *ret; spin_lock_bh(&psock->ingress_lock); if (list_is_last(&msg->list, &psock->ingress_msg)) ret = NULL; else ret = list_next_entry(msg, list); spin_unlock_bh(&psock->ingress_lock); return ret; } static inline bool sk_psock_queue_empty(const struct sk_psock *psock) { return psock ? list_empty(&psock->ingress_msg) : true; } static inline void kfree_sk_msg(struct sk_msg *msg) { if (msg->skb) consume_skb(msg->skb); kfree(msg); } static inline void sk_psock_report_error(struct sk_psock *psock, int err) { struct sock *sk = psock->sk; sk->sk_err = err; sk_error_report(sk); } struct sk_psock *sk_psock_init(struct sock *sk, int node); void sk_psock_stop(struct sk_psock *psock); #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock); void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock); void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock); #else static inline int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) { return -EOPNOTSUPP; } static inline void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) { } static inline void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) { } #endif void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock); void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock); int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg); #define sk_psock_init_link() \ ((struct sk_psock_link *)kzalloc(sizeof(struct sk_psock_link), \ GFP_ATOMIC | __GFP_NOWARN)) static inline void sk_psock_free_link(struct sk_psock_link *link) { kfree(link); } struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock); static inline void sk_psock_cork_free(struct sk_psock *psock) { if (psock->cork) { sk_msg_free(psock->sk, psock->cork); kfree(psock->cork); psock->cork = NULL; } } static inline void sk_psock_restore_proto(struct sock *sk, struct sk_psock *psock) { if (psock->psock_update_sk_prot) psock->psock_update_sk_prot(sk, psock, true); } static inline struct sk_psock *sk_psock_get(struct sock *sk) { struct sk_psock *psock; rcu_read_lock(); psock = sk_psock(sk); if (psock && !refcount_inc_not_zero(&psock->refcnt)) psock = NULL; rcu_read_unlock(); return psock; } void sk_psock_drop(struct sock *sk, struct sk_psock *psock); static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock) { if (refcount_dec_and_test(&psock->refcnt)) sk_psock_drop(sk, psock); } static inline void sk_psock_data_ready(struct sock *sk, struct sk_psock *psock) { read_lock_bh(&sk->sk_callback_lock); if (psock->saved_data_ready) psock->saved_data_ready(sk); else sk->sk_data_ready(sk); read_unlock_bh(&sk->sk_callback_lock); } static inline void psock_set_prog(struct bpf_prog **pprog, struct bpf_prog *prog) { prog = xchg(pprog, prog); if (prog) bpf_prog_put(prog); } static inline int psock_replace_prog(struct bpf_prog **pprog, struct bpf_prog *prog, struct bpf_prog *old) { if (cmpxchg(pprog, old, prog) != old) return -ENOENT; if (old) bpf_prog_put(old); return 0; } static inline void psock_progs_drop(struct sk_psock_progs *progs) { psock_set_prog(&progs->msg_parser, NULL); psock_set_prog(&progs->stream_parser, NULL); psock_set_prog(&progs->stream_verdict, NULL); psock_set_prog(&progs->skb_verdict, NULL); } int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb); static inline bool sk_psock_strp_enabled(struct sk_psock *psock) { if (!psock) return false; return !!psock->saved_data_ready; } #if IS_ENABLED(CONFIG_NET_SOCK_MSG) #define BPF_F_STRPARSER (1UL << 1) /* We only have two bits so far. */ #define BPF_F_PTR_MASK ~(BPF_F_INGRESS | BPF_F_STRPARSER) static inline bool skb_bpf_strparser(const struct sk_buff *skb) { unsigned long sk_redir = skb->_sk_redir; return sk_redir & BPF_F_STRPARSER; } static inline void skb_bpf_set_strparser(struct sk_buff *skb) { skb->_sk_redir |= BPF_F_STRPARSER; } static inline bool skb_bpf_ingress(const struct sk_buff *skb) { unsigned long sk_redir = skb->_sk_redir; return sk_redir & BPF_F_INGRESS; } static inline void skb_bpf_set_ingress(struct sk_buff *skb) { skb->_sk_redir |= BPF_F_INGRESS; } static inline void skb_bpf_set_redir(struct sk_buff *skb, struct sock *sk_redir, bool ingress) { skb->_sk_redir = (unsigned long)sk_redir; if (ingress) skb->_sk_redir |= BPF_F_INGRESS; } static inline struct sock *skb_bpf_redirect_fetch(const struct sk_buff *skb) { unsigned long sk_redir = skb->_sk_redir; return (struct sock *)(sk_redir & BPF_F_PTR_MASK); } static inline void skb_bpf_redirect_clear(struct sk_buff *skb) { skb->_sk_redir = 0; } #endif /* CONFIG_NET_SOCK_MSG */ #endif /* _LINUX_SKMSG_H */
158 157 157 230 226 230 192 152 5 5 5 4 255 258 260 27 26 27 26 155 156 2 32 33 3 106 107 102 103 2 2 2 2 85 82 84 84 84 33 1052 25 187 164 186 960 1058 1 1 1 861 2 879 865 33 28 28 32 9 10 246 249 11 248 245 9 9 1 9 78 78 78 78 1 1 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 // SPDX-License-Identifier: GPL-2.0-only #include <linux/mm.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/compiler.h> #include <linux/export.h> #include <linux/err.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/signal.h> #include <linux/sched/task_stack.h> #include <linux/security.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/mman.h> #include <linux/hugetlb.h> #include <linux/vmalloc.h> #include <linux/userfaultfd_k.h> #include <linux/elf.h> #include <linux/elf-randomize.h> #include <linux/personality.h> #include <linux/random.h> #include <linux/processor.h> #include <linux/sizes.h> #include <linux/compat.h> #include <linux/uaccess.h> #include "internal.h" #include "swap.h" /** * kfree_const - conditionally free memory * @x: pointer to the memory * * Function calls kfree only if @x is not in .rodata section. */ void kfree_const(const void *x) { if (!is_kernel_rodata((unsigned long)x)) kfree(x); } EXPORT_SYMBOL(kfree_const); /** * kstrdup - allocate space for and copy an existing string * @s: the string to duplicate * @gfp: the GFP mask used in the kmalloc() call when allocating memory * * Return: newly allocated copy of @s or %NULL in case of error */ noinline char *kstrdup(const char *s, gfp_t gfp) { size_t len; char *buf; if (!s) return NULL; len = strlen(s) + 1; buf = kmalloc_track_caller(len, gfp); if (buf) memcpy(buf, s, len); return buf; } EXPORT_SYMBOL(kstrdup); /** * kstrdup_const - conditionally duplicate an existing const string * @s: the string to duplicate * @gfp: the GFP mask used in the kmalloc() call when allocating memory * * Note: Strings allocated by kstrdup_const should be freed by kfree_const and * must not be passed to krealloc(). * * Return: source string if it is in .rodata section otherwise * fallback to kstrdup. */ const char *kstrdup_const(const char *s, gfp_t gfp) { if (is_kernel_rodata((unsigned long)s)) return s; return kstrdup(s, gfp); } EXPORT_SYMBOL(kstrdup_const); /** * kstrndup - allocate space for and copy an existing string * @s: the string to duplicate * @max: read at most @max chars from @s * @gfp: the GFP mask used in the kmalloc() call when allocating memory * * Note: Use kmemdup_nul() instead if the size is known exactly. * * Return: newly allocated copy of @s or %NULL in case of error */ char *kstrndup(const char *s, size_t max, gfp_t gfp) { size_t len; char *buf; if (!s) return NULL; len = strnlen(s, max); buf = kmalloc_track_caller(len+1, gfp); if (buf) { memcpy(buf, s, len); buf[len] = '\0'; } return buf; } EXPORT_SYMBOL(kstrndup); /** * kmemdup - duplicate region of memory * * @src: memory region to duplicate * @len: memory region length * @gfp: GFP mask to use * * Return: newly allocated copy of @src or %NULL in case of error, * result is physically contiguous. Use kfree() to free. */ void *kmemdup_noprof(const void *src, size_t len, gfp_t gfp) { void *p; p = kmalloc_node_track_caller_noprof(len, gfp, NUMA_NO_NODE, _RET_IP_); if (p) memcpy(p, src, len); return p; } EXPORT_SYMBOL(kmemdup_noprof); /** * kmemdup_array - duplicate a given array. * * @src: array to duplicate. * @count: number of elements to duplicate from array. * @element_size: size of each element of array. * @gfp: GFP mask to use. * * Return: duplicated array of @src or %NULL in case of error, * result is physically contiguous. Use kfree() to free. */ void *kmemdup_array(const void *src, size_t count, size_t element_size, gfp_t gfp) { return kmemdup(src, size_mul(element_size, count), gfp); } EXPORT_SYMBOL(kmemdup_array); /** * kvmemdup - duplicate region of memory * * @src: memory region to duplicate * @len: memory region length * @gfp: GFP mask to use * * Return: newly allocated copy of @src or %NULL in case of error, * result may be not physically contiguous. Use kvfree() to free. */ void *kvmemdup(const void *src, size_t len, gfp_t gfp) { void *p; p = kvmalloc(len, gfp); if (p) memcpy(p, src, len); return p; } EXPORT_SYMBOL(kvmemdup); /** * kmemdup_nul - Create a NUL-terminated string from unterminated data * @s: The data to stringify * @len: The size of the data * @gfp: the GFP mask used in the kmalloc() call when allocating memory * * Return: newly allocated copy of @s with NUL-termination or %NULL in * case of error */ char *kmemdup_nul(const char *s, size_t len, gfp_t gfp) { char *buf; if (!s) return NULL; buf = kmalloc_track_caller(len + 1, gfp); if (buf) { memcpy(buf, s, len); buf[len] = '\0'; } return buf; } EXPORT_SYMBOL(kmemdup_nul); /** * memdup_user - duplicate memory region from user space * * @src: source address in user space * @len: number of bytes to copy * * Return: an ERR_PTR() on failure. Result is physically * contiguous, to be freed by kfree(). */ void *memdup_user(const void __user *src, size_t len) { void *p; p = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN); if (!p) return ERR_PTR(-ENOMEM); if (copy_from_user(p, src, len)) { kfree(p); return ERR_PTR(-EFAULT); } return p; } EXPORT_SYMBOL(memdup_user); /** * vmemdup_user - duplicate memory region from user space * * @src: source address in user space * @len: number of bytes to copy * * Return: an ERR_PTR() on failure. Result may be not * physically contiguous. Use kvfree() to free. */ void *vmemdup_user(const void __user *src, size_t len) { void *p; p = kvmalloc(len, GFP_USER); if (!p) return ERR_PTR(-ENOMEM); if (copy_from_user(p, src, len)) { kvfree(p); return ERR_PTR(-EFAULT); } return p; } EXPORT_SYMBOL(vmemdup_user); /** * strndup_user - duplicate an existing string from user space * @s: The string to duplicate * @n: Maximum number of bytes to copy, including the trailing NUL. * * Return: newly allocated copy of @s or an ERR_PTR() in case of error */ char *strndup_user(const char __user *s, long n) { char *p; long length; length = strnlen_user(s, n); if (!length) return ERR_PTR(-EFAULT); if (length > n) return ERR_PTR(-EINVAL); p = memdup_user(s, length); if (IS_ERR(p)) return p; p[length - 1] = '\0'; return p; } EXPORT_SYMBOL(strndup_user); /** * memdup_user_nul - duplicate memory region from user space and NUL-terminate * * @src: source address in user space * @len: number of bytes to copy * * Return: an ERR_PTR() on failure. */ void *memdup_user_nul(const void __user *src, size_t len) { char *p; /* * Always use GFP_KERNEL, since copy_from_user() can sleep and * cause pagefault, which makes it pointless to use GFP_NOFS * or GFP_ATOMIC. */ p = kmalloc_track_caller(len + 1, GFP_KERNEL); if (!p) return ERR_PTR(-ENOMEM); if (copy_from_user(p, src, len)) { kfree(p); return ERR_PTR(-EFAULT); } p[len] = '\0'; return p; } EXPORT_SYMBOL(memdup_user_nul); /* Check if the vma is being used as a stack by this task */ int vma_is_stack_for_current(struct vm_area_struct *vma) { struct task_struct * __maybe_unused t = current; return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t)); } /* * Change backing file, only valid to use during initial VMA setup. */ void vma_set_file(struct vm_area_struct *vma, struct file *file) { /* Changing an anonymous vma with this is illegal */ get_file(file); swap(vma->vm_file, file); fput(file); } EXPORT_SYMBOL(vma_set_file); #ifndef STACK_RND_MASK #define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */ #endif unsigned long randomize_stack_top(unsigned long stack_top) { unsigned long random_variable = 0; if (current->flags & PF_RANDOMIZE) { random_variable = get_random_long(); random_variable &= STACK_RND_MASK; random_variable <<= PAGE_SHIFT; } #ifdef CONFIG_STACK_GROWSUP return PAGE_ALIGN(stack_top) + random_variable; #else return PAGE_ALIGN(stack_top) - random_variable; #endif } /** * randomize_page - Generate a random, page aligned address * @start: The smallest acceptable address the caller will take. * @range: The size of the area, starting at @start, within which the * random address must fall. * * If @start + @range would overflow, @range is capped. * * NOTE: Historical use of randomize_range, which this replaces, presumed that * @start was already page aligned. We now align it regardless. * * Return: A page aligned address within [start, start + range). On error, * @start is returned. */ unsigned long randomize_page(unsigned long start, unsigned long range) { if (!PAGE_ALIGNED(start)) { range -= PAGE_ALIGN(start) - start; start = PAGE_ALIGN(start); } if (start > ULONG_MAX - range) range = ULONG_MAX - start; range >>= PAGE_SHIFT; if (range == 0) return start; return start + (get_random_long() % range << PAGE_SHIFT); } #ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT unsigned long __weak arch_randomize_brk(struct mm_struct *mm) { /* Is the current task 32bit ? */ if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task()) return randomize_page(mm->brk, SZ_32M); return randomize_page(mm->brk, SZ_1G); } unsigned long arch_mmap_rnd(void) { unsigned long rnd; #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS if (is_compat_task()) rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1); else #endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */ rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); return rnd << PAGE_SHIFT; } static int mmap_is_legacy(struct rlimit *rlim_stack) { if (current->personality & ADDR_COMPAT_LAYOUT) return 1; /* On parisc the stack always grows up - so a unlimited stack should * not be an indicator to use the legacy memory layout. */ if (rlim_stack->rlim_cur == RLIM_INFINITY && !IS_ENABLED(CONFIG_STACK_GROWSUP)) return 1; return sysctl_legacy_va_layout; } /* * Leave enough space between the mmap area and the stack to honour ulimit in * the face of randomisation. */ #define MIN_GAP (SZ_128M) #define MAX_GAP (STACK_TOP / 6 * 5) static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) { #ifdef CONFIG_STACK_GROWSUP /* * For an upwards growing stack the calculation is much simpler. * Memory for the maximum stack size is reserved at the top of the * task. mmap_base starts directly below the stack and grows * downwards. */ return PAGE_ALIGN_DOWN(mmap_upper_limit(rlim_stack) - rnd); #else unsigned long gap = rlim_stack->rlim_cur; unsigned long pad = stack_guard_gap; /* Account for stack randomization if necessary */ if (current->flags & PF_RANDOMIZE) pad += (STACK_RND_MASK << PAGE_SHIFT); /* Values close to RLIM_INFINITY can overflow. */ if (gap + pad > gap) gap += pad; if (gap < MIN_GAP) gap = MIN_GAP; else if (gap > MAX_GAP) gap = MAX_GAP; return PAGE_ALIGN(STACK_TOP - gap - rnd); #endif } void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { unsigned long random_factor = 0UL; if (current->flags & PF_RANDOMIZE) random_factor = arch_mmap_rnd(); if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; clear_bit(MMF_TOPDOWN, &mm->flags); } else { mm->mmap_base = mmap_base(random_factor, rlim_stack); set_bit(MMF_TOPDOWN, &mm->flags); } } #elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { mm->mmap_base = TASK_UNMAPPED_BASE; clear_bit(MMF_TOPDOWN, &mm->flags); } #endif /** * __account_locked_vm - account locked pages to an mm's locked_vm * @mm: mm to account against * @pages: number of pages to account * @inc: %true if @pages should be considered positive, %false if not * @task: task used to check RLIMIT_MEMLOCK * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped * * Assumes @task and @mm are valid (i.e. at least one reference on each), and * that mmap_lock is held as writer. * * Return: * * 0 on success * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded. */ int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, struct task_struct *task, bool bypass_rlim) { unsigned long locked_vm, limit; int ret = 0; mmap_assert_write_locked(mm); locked_vm = mm->locked_vm; if (inc) { if (!bypass_rlim) { limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT; if (locked_vm + pages > limit) ret = -ENOMEM; } if (!ret) mm->locked_vm = locked_vm + pages; } else { WARN_ON_ONCE(pages > locked_vm); mm->locked_vm = locked_vm - pages; } pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid, (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT, locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK), ret ? " - exceeded" : ""); return ret; } EXPORT_SYMBOL_GPL(__account_locked_vm); /** * account_locked_vm - account locked pages to an mm's locked_vm * @mm: mm to account against, may be NULL * @pages: number of pages to account * @inc: %true if @pages should be considered positive, %false if not * * Assumes a non-NULL @mm is valid (i.e. at least one reference on it). * * Return: * * 0 on success, or if mm is NULL * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded. */ int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc) { int ret; if (pages == 0 || !mm) return 0; mmap_write_lock(mm); ret = __account_locked_vm(mm, pages, inc, current, capable(CAP_IPC_LOCK)); mmap_write_unlock(mm); return ret; } EXPORT_SYMBOL_GPL(account_locked_vm); unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flag, unsigned long pgoff) { unsigned long ret; struct mm_struct *mm = current->mm; unsigned long populate; LIST_HEAD(uf); ret = security_mmap_file(file, prot, flag); if (!ret) { if (mmap_write_lock_killable(mm)) return -EINTR; ret = do_mmap(file, addr, len, prot, flag, 0, pgoff, &populate, &uf); mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); if (populate) mm_populate(ret, populate); } return ret; } unsigned long vm_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flag, unsigned long offset) { if (unlikely(offset + PAGE_ALIGN(len) < offset)) return -EINVAL; if (unlikely(offset_in_page(offset))) return -EINVAL; return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); } EXPORT_SYMBOL(vm_mmap); /** * kvmalloc_node - attempt to allocate physically contiguous memory, but upon * failure, fall back to non-contiguous (vmalloc) allocation. * @size: size of the request. * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL. * @node: numa node to allocate from * * Uses kmalloc to get the memory but if the allocation fails then falls back * to the vmalloc allocator. Use kvfree for freeing the memory. * * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier. * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is * preferable to the vmalloc fallback, due to visible performance drawbacks. * * Return: pointer to the allocated memory of %NULL in case of failure */ void *kvmalloc_node_noprof(size_t size, gfp_t flags, int node) { gfp_t kmalloc_flags = flags; void *ret; /* * We want to attempt a large physically contiguous block first because * it is less likely to fragment multiple larger blocks and therefore * contribute to a long term fragmentation less than vmalloc fallback. * However make sure that larger requests are not too disruptive - no * OOM killer and no allocation failure warnings as we have a fallback. */ if (size > PAGE_SIZE) { kmalloc_flags |= __GFP_NOWARN; if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL)) kmalloc_flags |= __GFP_NORETRY; /* nofail semantic is implemented by the vmalloc fallback */ kmalloc_flags &= ~__GFP_NOFAIL; } ret = kmalloc_node_noprof(size, kmalloc_flags, node); /* * It doesn't really make sense to fallback to vmalloc for sub page * requests */ if (ret || size <= PAGE_SIZE) return ret; /* non-sleeping allocations are not supported by vmalloc */ if (!gfpflags_allow_blocking(flags)) return NULL; /* Don't even allow crazy sizes */ if (unlikely(size > INT_MAX)) { WARN_ON_ONCE(!(flags & __GFP_NOWARN)); return NULL; } /* * kvmalloc() can always use VM_ALLOW_HUGE_VMAP, * since the callers already cannot assume anything * about the resulting pointer, and cannot play * protection games. */ return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, node, __builtin_return_address(0)); } EXPORT_SYMBOL(kvmalloc_node_noprof); /** * kvfree() - Free memory. * @addr: Pointer to allocated memory. * * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc(). * It is slightly more efficient to use kfree() or vfree() if you are certain * that you know which one to use. * * Context: Either preemptible task context or not-NMI interrupt. */ void kvfree(const void *addr) { if (is_vmalloc_addr(addr)) vfree(addr); else kfree(addr); } EXPORT_SYMBOL(kvfree); /** * kvfree_sensitive - Free a data object containing sensitive information. * @addr: address of the data object to be freed. * @len: length of the data object. * * Use the special memzero_explicit() function to clear the content of a * kvmalloc'ed object containing sensitive data to make sure that the * compiler won't optimize out the data clearing. */ void kvfree_sensitive(const void *addr, size_t len) { if (likely(!ZERO_OR_NULL_PTR(addr))) { memzero_explicit((void *)addr, len); kvfree(addr); } } EXPORT_SYMBOL(kvfree_sensitive); void *kvrealloc_noprof(const void *p, size_t oldsize, size_t newsize, gfp_t flags) { void *newp; if (oldsize >= newsize) return (void *)p; newp = kvmalloc_noprof(newsize, flags); if (!newp) return NULL; memcpy(newp, p, oldsize); kvfree(p); return newp; } EXPORT_SYMBOL(kvrealloc_noprof); /** * __vmalloc_array - allocate memory for a virtually contiguous array. * @n: number of elements. * @size: element size. * @flags: the type of memory to allocate (see kmalloc). */ void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags) { size_t bytes; if (unlikely(check_mul_overflow(n, size, &bytes))) return NULL; return __vmalloc_noprof(bytes, flags); } EXPORT_SYMBOL(__vmalloc_array_noprof); /** * vmalloc_array - allocate memory for a virtually contiguous array. * @n: number of elements. * @size: element size. */ void *vmalloc_array_noprof(size_t n, size_t size) { return __vmalloc_array_noprof(n, size, GFP_KERNEL); } EXPORT_SYMBOL(vmalloc_array_noprof); /** * __vcalloc - allocate and zero memory for a virtually contiguous array. * @n: number of elements. * @size: element size. * @flags: the type of memory to allocate (see kmalloc). */ void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags) { return __vmalloc_array_noprof(n, size, flags | __GFP_ZERO); } EXPORT_SYMBOL(__vcalloc_noprof); /** * vcalloc - allocate and zero memory for a virtually contiguous array. * @n: number of elements. * @size: element size. */ void *vcalloc_noprof(size_t n, size_t size) { return __vmalloc_array_noprof(n, size, GFP_KERNEL | __GFP_ZERO); } EXPORT_SYMBOL(vcalloc_noprof); struct anon_vma *folio_anon_vma(struct folio *folio) { unsigned long mapping = (unsigned long)folio->mapping; if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) return NULL; return (void *)(mapping - PAGE_MAPPING_ANON); } /** * folio_mapping - Find the mapping where this folio is stored. * @folio: The folio. * * For folios which are in the page cache, return the mapping that this * page belongs to. Folios in the swap cache return the swap mapping * this page is stored in (which is different from the mapping for the * swap file or swap device where the data is stored). * * You can call this for folios which aren't in the swap cache or page * cache and it will return NULL. */ struct address_space *folio_mapping(struct folio *folio) { struct address_space *mapping; /* This happens if someone calls flush_dcache_page on slab page */ if (unlikely(folio_test_slab(folio))) return NULL; if (unlikely(folio_test_swapcache(folio))) return swap_address_space(folio->swap); mapping = folio->mapping; if ((unsigned long)mapping & PAGE_MAPPING_FLAGS) return NULL; return mapping; } EXPORT_SYMBOL(folio_mapping); /** * folio_copy - Copy the contents of one folio to another. * @dst: Folio to copy to. * @src: Folio to copy from. * * The bytes in the folio represented by @src are copied to @dst. * Assumes the caller has validated that @dst is at least as large as @src. * Can be called in atomic context for order-0 folios, but if the folio is * larger, it may sleep. */ void folio_copy(struct folio *dst, struct folio *src) { long i = 0; long nr = folio_nr_pages(src); for (;;) { copy_highpage(folio_page(dst, i), folio_page(src, i)); if (++i == nr) break; cond_resched(); } } EXPORT_SYMBOL(folio_copy); int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; int sysctl_overcommit_ratio __read_mostly = 50; unsigned long sysctl_overcommit_kbytes __read_mostly; int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ int overcommit_ratio_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret == 0 && write) sysctl_overcommit_kbytes = 0; return ret; } static void sync_overcommit_as(struct work_struct *dummy) { percpu_counter_sync(&vm_committed_as); } int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; int new_policy = -1; int ret; /* * The deviation of sync_overcommit_as could be big with loose policy * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to * strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply * with the strict "NEVER", and to avoid possible race condition (even * though user usually won't too frequently do the switching to policy * OVERCOMMIT_NEVER), the switch is done in the following order: * 1. changing the batch * 2. sync percpu count on each CPU * 3. switch the policy */ if (write) { t = *table; t.data = &new_policy; ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); if (ret || new_policy == -1) return ret; mm_compute_batch(new_policy); if (new_policy == OVERCOMMIT_NEVER) schedule_on_each_cpu(sync_overcommit_as); sysctl_overcommit_memory = new_policy; } else { ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); } return ret; } int overcommit_kbytes_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write) sysctl_overcommit_ratio = 0; return ret; } /* * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used */ unsigned long vm_commit_limit(void) { unsigned long allowed; if (sysctl_overcommit_kbytes) allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10); else allowed = ((totalram_pages() - hugetlb_total_pages()) * sysctl_overcommit_ratio / 100); allowed += total_swap_pages; return allowed; } /* * Make sure vm_committed_as in one cacheline and not cacheline shared with * other variables. It can be updated by several CPUs frequently. */ struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp; /* * The global memory commitment made in the system can be a metric * that can be used to drive ballooning decisions when Linux is hosted * as a guest. On Hyper-V, the host implements a policy engine for dynamically * balancing memory across competing virtual machines that are hosted. * Several metrics drive this policy engine including the guest reported * memory commitment. * * The time cost of this is very low for small platforms, and for big * platform like a 2S/36C/72T Skylake server, in worst case where * vm_committed_as's spinlock is under severe contention, the time cost * could be about 30~40 microseconds. */ unsigned long vm_memory_committed(void) { return percpu_counter_sum_positive(&vm_committed_as); } EXPORT_SYMBOL_GPL(vm_memory_committed); /* * Check that a process has enough memory to allocate a new virtual * mapping. 0 means there is enough memory for the allocation to * succeed and -ENOMEM implies there is not. * * We currently support three overcommit policies, which are set via the * vm.overcommit_memory sysctl. See Documentation/mm/overcommit-accounting.rst * * Strict overcommit modes added 2002 Feb 26 by Alan Cox. * Additional code 2002 Jul 20 by Robert Love. * * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. * * Note this is a helper function intended to be used by LSMs which * wish to use this logic. */ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) { long allowed; unsigned long bytes_failed; vm_acct_memory(pages); /* * Sometimes we want to use more memory than we have */ if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) return 0; if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { if (pages > totalram_pages() + total_swap_pages) goto error; return 0; } allowed = vm_commit_limit(); /* * Reserve some for root */ if (!cap_sys_admin) allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); /* * Don't let a single process grow so big a user can't recover */ if (mm) { long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); allowed -= min_t(long, mm->total_vm / 32, reserve); } if (percpu_counter_read_positive(&vm_committed_as) < allowed) return 0; error: bytes_failed = pages << PAGE_SHIFT; pr_warn_ratelimited("%s: pid: %d, comm: %s, bytes: %lu not enough memory for the allocation\n", __func__, current->pid, current->comm, bytes_failed); vm_unacct_memory(pages); return -ENOMEM; } /** * get_cmdline() - copy the cmdline value to a buffer. * @task: the task whose cmdline value to copy. * @buffer: the buffer to copy to. * @buflen: the length of the buffer. Larger cmdline values are truncated * to this length. * * Return: the size of the cmdline field copied. Note that the copy does * not guarantee an ending NULL byte. */ int get_cmdline(struct task_struct *task, char *buffer, int buflen) { int res = 0; unsigned int len; struct mm_struct *mm = get_task_mm(task); unsigned long arg_start, arg_end, env_start, env_end; if (!mm) goto out; if (!mm->arg_end) goto out_mm; /* Shh! No looking before we're done */ spin_lock(&mm->arg_lock); arg_start = mm->arg_start; arg_end = mm->arg_end; env_start = mm->env_start; env_end = mm->env_end; spin_unlock(&mm->arg_lock); len = arg_end - arg_start; if (len > buflen) len = buflen; res = access_process_vm(task, arg_start, buffer, len, FOLL_FORCE); /* * If the nul at the end of args has been overwritten, then * assume application is using setproctitle(3). */ if (res > 0 && buffer[res-1] != '\0' && len < buflen) { len = strnlen(buffer, res); if (len < res) { res = len; } else { len = env_end - env_start; if (len > buflen - res) len = buflen - res; res += access_process_vm(task, env_start, buffer+res, len, FOLL_FORCE); res = strnlen(buffer, res); } } out_mm: mmput(mm); out: return res; } int __weak memcmp_pages(struct page *page1, struct page *page2) { char *addr1, *addr2; int ret; addr1 = kmap_local_page(page1); addr2 = kmap_local_page(page2); ret = memcmp(addr1, addr2, PAGE_SIZE); kunmap_local(addr2); kunmap_local(addr1); return ret; } #ifdef CONFIG_PRINTK /** * mem_dump_obj - Print available provenance information * @object: object for which to find provenance information. * * This function uses pr_cont(), so that the caller is expected to have * printed out whatever preamble is appropriate. The provenance information * depends on the type of object and on how much debugging is enabled. * For example, for a slab-cache object, the slab name is printed, and, * if available, the return address and stack trace from the allocation * and last free path of that object. */ void mem_dump_obj(void *object) { const char *type; if (kmem_dump_obj(object)) return; if (vmalloc_dump_obj(object)) return; if (is_vmalloc_addr(object)) type = "vmalloc memory"; else if (virt_addr_valid(object)) type = "non-slab/vmalloc memory"; else if (object == NULL) type = "NULL pointer"; else if (object == ZERO_SIZE_PTR) type = "zero-size pointer"; else type = "non-paged memory"; pr_cont(" %s\n", type); } EXPORT_SYMBOL_GPL(mem_dump_obj); #endif /* * A driver might set a page logically offline -- PageOffline() -- and * turn the page inaccessible in the hypervisor; after that, access to page * content can be fatal. * * Some special PFN walkers -- i.e., /proc/kcore -- read content of random * pages after checking PageOffline(); however, these PFN walkers can race * with drivers that set PageOffline(). * * page_offline_freeze()/page_offline_thaw() allows for a subsystem to * synchronize with such drivers, achieving that a page cannot be set * PageOffline() while frozen. * * page_offline_begin()/page_offline_end() is used by drivers that care about * such races when setting a page PageOffline(). */ static DECLARE_RWSEM(page_offline_rwsem); void page_offline_freeze(void) { down_read(&page_offline_rwsem); } void page_offline_thaw(void) { up_read(&page_offline_rwsem); } void page_offline_begin(void) { down_write(&page_offline_rwsem); } EXPORT_SYMBOL(page_offline_begin); void page_offline_end(void) { up_write(&page_offline_rwsem); } EXPORT_SYMBOL(page_offline_end); #ifndef flush_dcache_folio void flush_dcache_folio(struct folio *folio) { long i, nr = folio_nr_pages(folio); for (i = 0; i < nr; i++) flush_dcache_page(folio_page(folio, i)); } EXPORT_SYMBOL(flush_dcache_folio); #endif
3 3 3 3 2 2 3 3 3 3 3 2 3 3 3 3 3 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2010 Red Hat, Inc. * Copyright (c) 2016-2021 Christoph Hellwig. */ #include <linux/fs.h> #include <linux/iomap.h> #include "trace.h" /* * Advance to the next range we need to map. * * If the iomap is marked IOMAP_F_STALE, it means the existing map was not fully * processed - it was aborted because the extent the iomap spanned may have been * changed during the operation. In this case, the iteration behaviour is to * remap the unprocessed range of the iter, and that means we may need to remap * even when we've made no progress (i.e. iter->processed = 0). Hence the * "finished iterating" case needs to distinguish between * (processed = 0) meaning we are done and (processed = 0 && stale) meaning we * need to remap the entire remaining range. */ static inline int iomap_iter_advance(struct iomap_iter *iter) { bool stale = iter->iomap.flags & IOMAP_F_STALE; /* handle the previous iteration (if any) */ if (iter->iomap.length) { if (iter->processed < 0) return iter->processed; if (!iter->processed && !stale) return 0; if (WARN_ON_ONCE(iter->processed > iomap_length(iter))) return -EIO; iter->pos += iter->processed; iter->len -= iter->processed; if (!iter->len) return 0; } /* clear the state for the next iteration */ iter->processed = 0; memset(&iter->iomap, 0, sizeof(iter->iomap)); memset(&iter->srcmap, 0, sizeof(iter->srcmap)); return 1; } static inline void iomap_iter_done(struct iomap_iter *iter) { WARN_ON_ONCE(iter->iomap.offset > iter->pos); WARN_ON_ONCE(iter->iomap.length == 0); WARN_ON_ONCE(iter->iomap.offset + iter->iomap.length <= iter->pos); WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_STALE); trace_iomap_iter_dstmap(iter->inode, &iter->iomap); if (iter->srcmap.type != IOMAP_HOLE) trace_iomap_iter_srcmap(iter->inode, &iter->srcmap); } /** * iomap_iter - iterate over a ranges in a file * @iter: iteration structue * @ops: iomap ops provided by the file system * * Iterate over filesystem-provided space mappings for the provided file range. * * This function handles cleanup of resources acquired for iteration when the * filesystem indicates there are no more space mappings, which means that this * function must be called in a loop that continues as long it returns a * positive value. If 0 or a negative value is returned, the caller must not * return to the loop body. Within a loop body, there are two ways to break out * of the loop body: leave @iter.processed unchanged, or set it to a negative * errno. */ int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops) { int ret; if (iter->iomap.length && ops->iomap_end) { ret = ops->iomap_end(iter->inode, iter->pos, iomap_length(iter), iter->processed > 0 ? iter->processed : 0, iter->flags, &iter->iomap); if (ret < 0 && !iter->processed) return ret; } trace_iomap_iter(iter, ops, _RET_IP_); ret = iomap_iter_advance(iter); if (ret <= 0) return ret; ret = ops->iomap_begin(iter->inode, iter->pos, iter->len, iter->flags, &iter->iomap, &iter->srcmap); if (ret < 0) return ret; iomap_iter_done(iter); return 1; }
2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 2 2 2 1 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/resize.c * * Support for resizing an ext4 filesystem while it is mounted. * * Copyright (C) 2001, 2002 Andreas Dilger <adilger@clusterfs.com> * * This could probably be made into a module, because it is not often in use. */ #include <linux/errno.h> #include <linux/slab.h> #include <linux/jiffies.h> #include "ext4_jbd2.h" struct ext4_rcu_ptr { struct rcu_head rcu; void *ptr; }; static void ext4_rcu_ptr_callback(struct rcu_head *head) { struct ext4_rcu_ptr *ptr; ptr = container_of(head, struct ext4_rcu_ptr, rcu); kvfree(ptr->ptr); kfree(ptr); } void ext4_kvfree_array_rcu(void *to_free) { struct ext4_rcu_ptr *ptr = kzalloc(sizeof(*ptr), GFP_KERNEL); if (ptr) { ptr->ptr = to_free; call_rcu(&ptr->rcu, ext4_rcu_ptr_callback); return; } synchronize_rcu(); kvfree(to_free); } int ext4_resize_begin(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); int ret = 0; if (!capable(CAP_SYS_RESOURCE)) return -EPERM; /* * If the reserved GDT blocks is non-zero, the resize_inode feature * should always be set. */ if (sbi->s_es->s_reserved_gdt_blocks && !ext4_has_feature_resize_inode(sb)) { ext4_error(sb, "resize_inode disabled but reserved GDT blocks non-zero"); return -EFSCORRUPTED; } /* * If we are not using the primary superblock/GDT copy don't resize, * because the user tools have no way of handling this. Probably a * bad time to do it anyways. */ if (EXT4_B2C(sbi, sbi->s_sbh->b_blocknr) != le32_to_cpu(sbi->s_es->s_first_data_block)) { ext4_warning(sb, "won't resize using backup superblock at %llu", (unsigned long long)sbi->s_sbh->b_blocknr); return -EPERM; } /* * We are not allowed to do online-resizing on a filesystem mounted * with error, because it can destroy the filesystem easily. */ if (sbi->s_mount_state & EXT4_ERROR_FS) { ext4_warning(sb, "There are errors in the filesystem, " "so online resizing is not allowed"); return -EPERM; } if (ext4_has_feature_sparse_super2(sb)) { ext4_msg(sb, KERN_ERR, "Online resizing not supported with sparse_super2"); return -EOPNOTSUPP; } if (test_and_set_bit_lock(EXT4_FLAGS_RESIZING, &sbi->s_ext4_flags)) ret = -EBUSY; return ret; } int ext4_resize_end(struct super_block *sb, bool update_backups) { clear_bit_unlock(EXT4_FLAGS_RESIZING, &EXT4_SB(sb)->s_ext4_flags); smp_mb__after_atomic(); if (update_backups) return ext4_update_overhead(sb, true); return 0; } static ext4_grpblk_t ext4_group_overhead_blocks(struct super_block *sb, ext4_group_t group) { ext4_grpblk_t overhead; overhead = ext4_bg_num_gdb(sb, group); if (ext4_bg_has_super(sb, group)) overhead += 1 + le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks); return overhead; } #define outside(b, first, last) ((b) < (first) || (b) >= (last)) #define inside(b, first, last) ((b) >= (first) && (b) < (last)) static int verify_group_input(struct super_block *sb, struct ext4_new_group_data *input) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; ext4_fsblk_t start = ext4_blocks_count(es); ext4_fsblk_t end = start + input->blocks_count; ext4_group_t group = input->group; ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group; unsigned overhead; ext4_fsblk_t metaend; struct buffer_head *bh = NULL; ext4_grpblk_t free_blocks_count, offset; int err = -EINVAL; if (group != sbi->s_groups_count) { ext4_warning(sb, "Cannot add at group %u (only %u groups)", input->group, sbi->s_groups_count); return -EINVAL; } overhead = ext4_group_overhead_blocks(sb, group); metaend = start + overhead; free_blocks_count = input->blocks_count - 2 - overhead - sbi->s_itb_per_group; input->free_clusters_count = EXT4_B2C(sbi, free_blocks_count); if (test_opt(sb, DEBUG)) printk(KERN_DEBUG "EXT4-fs: adding %s group %u: %u blocks " "(%d free, %u reserved)\n", ext4_bg_has_super(sb, input->group) ? "normal" : "no-super", input->group, input->blocks_count, free_blocks_count, input->reserved_blocks); ext4_get_group_no_and_offset(sb, start, NULL, &offset); if (offset != 0) ext4_warning(sb, "Last group not full"); else if (input->reserved_blocks > input->blocks_count / 5) ext4_warning(sb, "Reserved blocks too high (%u)", input->reserved_blocks); else if (free_blocks_count < 0) ext4_warning(sb, "Bad blocks count %u", input->blocks_count); else if (IS_ERR(bh = ext4_sb_bread(sb, end - 1, 0))) { err = PTR_ERR(bh); bh = NULL; ext4_warning(sb, "Cannot read last block (%llu)", end - 1); } else if (outside(input->block_bitmap, start, end)) ext4_warning(sb, "Block bitmap not in group (block %llu)", (unsigned long long)input->block_bitmap); else if (outside(input->inode_bitmap, start, end)) ext4_warning(sb, "Inode bitmap not in group (block %llu)", (unsigned long long)input->inode_bitmap); else if (outside(input->inode_table, start, end) || outside(itend - 1, start, end)) ext4_warning(sb, "Inode table not in group (blocks %llu-%llu)", (unsigned long long)input->inode_table, itend - 1); else if (input->inode_bitmap == input->block_bitmap) ext4_warning(sb, "Block bitmap same as inode bitmap (%llu)", (unsigned long long)input->block_bitmap); else if (inside(input->block_bitmap, input->inode_table, itend)) ext4_warning(sb, "Block bitmap (%llu) in inode table " "(%llu-%llu)", (unsigned long long)input->block_bitmap, (unsigned long long)input->inode_table, itend - 1); else if (inside(input->inode_bitmap, input->inode_table, itend)) ext4_warning(sb, "Inode bitmap (%llu) in inode table " "(%llu-%llu)", (unsigned long long)input->inode_bitmap, (unsigned long long)input->inode_table, itend - 1); else if (inside(input->block_bitmap, start, metaend)) ext4_warning(sb, "Block bitmap (%llu) in GDT table (%llu-%llu)", (unsigned long long)input->block_bitmap, start, metaend - 1); else if (inside(input->inode_bitmap, start, metaend)) ext4_warning(sb, "Inode bitmap (%llu) in GDT table (%llu-%llu)", (unsigned long long)input->inode_bitmap, start, metaend - 1); else if (inside(input->inode_table, start, metaend) || inside(itend - 1, start, metaend)) ext4_warning(sb, "Inode table (%llu-%llu) overlaps GDT table " "(%llu-%llu)", (unsigned long long)input->inode_table, itend - 1, start, metaend - 1); else err = 0; brelse(bh); return err; } /* * ext4_new_flex_group_data is used by 64bit-resize interface to add a flex * group each time. */ struct ext4_new_flex_group_data { struct ext4_new_group_data *groups; /* new_group_data for groups in the flex group */ __u16 *bg_flags; /* block group flags of groups in @groups */ ext4_group_t resize_bg; /* number of allocated new_group_data */ ext4_group_t count; /* number of groups in @groups */ }; /* * Avoiding memory allocation failures due to too many groups added each time. */ #define MAX_RESIZE_BG 16384 /* * alloc_flex_gd() allocates a ext4_new_flex_group_data with size of * @flexbg_size. * * Returns NULL on failure otherwise address of the allocated structure. */ static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned int flexbg_size, ext4_group_t o_group, ext4_group_t n_group) { ext4_group_t last_group; struct ext4_new_flex_group_data *flex_gd; flex_gd = kmalloc(sizeof(*flex_gd), GFP_NOFS); if (flex_gd == NULL) goto out3; if (unlikely(flexbg_size > MAX_RESIZE_BG)) flex_gd->resize_bg = MAX_RESIZE_BG; else flex_gd->resize_bg = flexbg_size; /* Avoid allocating large 'groups' array if not needed */ last_group = o_group | (flex_gd->resize_bg - 1); if (n_group <= last_group) flex_gd->resize_bg = 1 << fls(n_group - o_group + 1); else if (n_group - last_group < flex_gd->resize_bg) flex_gd->resize_bg = 1 << max(fls(last_group - o_group + 1), fls(n_group - last_group)); flex_gd->groups = kmalloc_array(flex_gd->resize_bg, sizeof(struct ext4_new_group_data), GFP_NOFS); if (flex_gd->groups == NULL) goto out2; flex_gd->bg_flags = kmalloc_array(flex_gd->resize_bg, sizeof(__u16), GFP_NOFS); if (flex_gd->bg_flags == NULL) goto out1; return flex_gd; out1: kfree(flex_gd->groups); out2: kfree(flex_gd); out3: return NULL; } static void free_flex_gd(struct ext4_new_flex_group_data *flex_gd) { kfree(flex_gd->bg_flags); kfree(flex_gd->groups); kfree(flex_gd); } /* * ext4_alloc_group_tables() allocates block bitmaps, inode bitmaps * and inode tables for a flex group. * * This function is used by 64bit-resize. Note that this function allocates * group tables from the 1st group of groups contained by @flexgd, which may * be a partial of a flex group. * * @sb: super block of fs to which the groups belongs * * Returns 0 on a successful allocation of the metadata blocks in the * block group. */ static int ext4_alloc_group_tables(struct super_block *sb, struct ext4_new_flex_group_data *flex_gd, unsigned int flexbg_size) { struct ext4_new_group_data *group_data = flex_gd->groups; ext4_fsblk_t start_blk; ext4_fsblk_t last_blk; ext4_group_t src_group; ext4_group_t bb_index = 0; ext4_group_t ib_index = 0; ext4_group_t it_index = 0; ext4_group_t group; ext4_group_t last_group; unsigned overhead; __u16 uninit_mask = (flexbg_size > 1) ? ~EXT4_BG_BLOCK_UNINIT : ~0; int i; BUG_ON(flex_gd->count == 0 || group_data == NULL); src_group = group_data[0].group; last_group = src_group + flex_gd->count - 1; BUG_ON((flexbg_size > 1) && ((src_group & ~(flexbg_size - 1)) != (last_group & ~(flexbg_size - 1)))); next_group: group = group_data[0].group; if (src_group >= group_data[0].group + flex_gd->count) return -ENOSPC; start_blk = ext4_group_first_block_no(sb, src_group); last_blk = start_blk + group_data[src_group - group].blocks_count; overhead = ext4_group_overhead_blocks(sb, src_group); start_blk += overhead; /* We collect contiguous blocks as much as possible. */ src_group++; for (; src_group <= last_group; src_group++) { overhead = ext4_group_overhead_blocks(sb, src_group); if (overhead == 0) last_blk += group_data[src_group - group].blocks_count; else break; } /* Allocate block bitmaps */ for (; bb_index < flex_gd->count; bb_index++) { if (start_blk >= last_blk) goto next_group; group_data[bb_index].block_bitmap = start_blk++; group = ext4_get_group_number(sb, start_blk - 1); group -= group_data[0].group; group_data[group].mdata_blocks++; flex_gd->bg_flags[group] &= uninit_mask; } /* Allocate inode bitmaps */ for (; ib_index < flex_gd->count; ib_index++) { if (start_blk >= last_blk) goto next_group; group_data[ib_index].inode_bitmap = start_blk++; group = ext4_get_group_number(sb, start_blk - 1); group -= group_data[0].group; group_data[group].mdata_blocks++; flex_gd->bg_flags[group] &= uninit_mask; } /* Allocate inode tables */ for (; it_index < flex_gd->count; it_index++) { unsigned int itb = EXT4_SB(sb)->s_itb_per_group; ext4_fsblk_t next_group_start; if (start_blk + itb > last_blk) goto next_group; group_data[it_index].inode_table = start_blk; group = ext4_get_group_number(sb, start_blk); next_group_start = ext4_group_first_block_no(sb, group + 1); group -= group_data[0].group; if (start_blk + itb > next_group_start) { flex_gd->bg_flags[group + 1] &= uninit_mask; overhead = start_blk + itb - next_group_start; group_data[group + 1].mdata_blocks += overhead; itb -= overhead; } group_data[group].mdata_blocks += itb; flex_gd->bg_flags[group] &= uninit_mask; start_blk += EXT4_SB(sb)->s_itb_per_group; } /* Update free clusters count to exclude metadata blocks */ for (i = 0; i < flex_gd->count; i++) { group_data[i].free_clusters_count -= EXT4_NUM_B2C(EXT4_SB(sb), group_data[i].mdata_blocks); } if (test_opt(sb, DEBUG)) { int i; group = group_data[0].group; printk(KERN_DEBUG "EXT4-fs: adding a flex group with " "%u groups, flexbg size is %u:\n", flex_gd->count, flexbg_size); for (i = 0; i < flex_gd->count; i++) { ext4_debug( "adding %s group %u: %u blocks (%u free, %u mdata blocks)\n", ext4_bg_has_super(sb, group + i) ? "normal" : "no-super", group + i, group_data[i].blocks_count, group_data[i].free_clusters_count, group_data[i].mdata_blocks); } } return 0; } static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, ext4_fsblk_t blk) { struct buffer_head *bh; int err; bh = sb_getblk(sb, blk); if (unlikely(!bh)) return ERR_PTR(-ENOMEM); BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE); if (err) { brelse(bh); bh = ERR_PTR(err); } else { memset(bh->b_data, 0, sb->s_blocksize); set_buffer_uptodate(bh); } return bh; } static int ext4_resize_ensure_credits_batch(handle_t *handle, int credits) { return ext4_journal_ensure_credits_fn(handle, credits, EXT4_MAX_TRANS_DATA, 0, 0); } /* * set_flexbg_block_bitmap() mark clusters [@first_cluster, @last_cluster] used. * * Helper function for ext4_setup_new_group_blocks() which set . * * @sb: super block * @handle: journal handle * @flex_gd: flex group data */ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, struct ext4_new_flex_group_data *flex_gd, ext4_fsblk_t first_cluster, ext4_fsblk_t last_cluster) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t count = last_cluster - first_cluster + 1; ext4_group_t count2; ext4_debug("mark clusters [%llu-%llu] used\n", first_cluster, last_cluster); for (; count > 0; count -= count2, first_cluster += count2) { ext4_fsblk_t start; struct buffer_head *bh; ext4_group_t group; int err; group = ext4_get_group_number(sb, EXT4_C2B(sbi, first_cluster)); start = EXT4_B2C(sbi, ext4_group_first_block_no(sb, group)); group -= flex_gd->groups[0].group; count2 = EXT4_CLUSTERS_PER_GROUP(sb) - (first_cluster - start); if (count2 > count) count2 = count; if (flex_gd->bg_flags[group] & EXT4_BG_BLOCK_UNINIT) { BUG_ON(flex_gd->count > 1); continue; } err = ext4_resize_ensure_credits_batch(handle, 1); if (err < 0) return err; bh = sb_getblk(sb, flex_gd->groups[group].block_bitmap); if (unlikely(!bh)) return -ENOMEM; BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE); if (err) { brelse(bh); return err; } ext4_debug("mark block bitmap %#04llx (+%llu/%u)\n", first_cluster, first_cluster - start, count2); mb_set_bits(bh->b_data, first_cluster - start, count2); err = ext4_handle_dirty_metadata(handle, NULL, bh); brelse(bh); if (unlikely(err)) return err; } return 0; } /* * Set up the block and inode bitmaps, and the inode table for the new groups. * This doesn't need to be part of the main transaction, since we are only * changing blocks outside the actual filesystem. We still do journaling to * ensure the recovery is correct in case of a failure just after resize. * If any part of this fails, we simply abort the resize. * * setup_new_flex_group_blocks handles a flex group as follow: * 1. copy super block and GDT, and initialize group tables if necessary. * In this step, we only set bits in blocks bitmaps for blocks taken by * super block and GDT. * 2. allocate group tables in block bitmaps, that is, set bits in block * bitmap for blocks taken by group tables. */ static int setup_new_flex_group_blocks(struct super_block *sb, struct ext4_new_flex_group_data *flex_gd) { int group_table_count[] = {1, 1, EXT4_SB(sb)->s_itb_per_group}; ext4_fsblk_t start; ext4_fsblk_t block; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; struct ext4_new_group_data *group_data = flex_gd->groups; __u16 *bg_flags = flex_gd->bg_flags; handle_t *handle; ext4_group_t group, count; struct buffer_head *bh = NULL; int reserved_gdb, i, j, err = 0, err2; int meta_bg; BUG_ON(!flex_gd->count || !group_data || group_data[0].group != sbi->s_groups_count); reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks); meta_bg = ext4_has_feature_meta_bg(sb); /* This transaction may be extended/restarted along the way */ handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, EXT4_MAX_TRANS_DATA); if (IS_ERR(handle)) return PTR_ERR(handle); group = group_data[0].group; for (i = 0; i < flex_gd->count; i++, group++) { unsigned long gdblocks; ext4_grpblk_t overhead; gdblocks = ext4_bg_num_gdb(sb, group); start = ext4_group_first_block_no(sb, group); if (meta_bg == 0 && !ext4_bg_has_super(sb, group)) goto handle_itb; if (meta_bg == 1) goto handle_itb; block = start + ext4_bg_has_super(sb, group); /* Copy all of the GDT blocks into the backup in this group */ for (j = 0; j < gdblocks; j++, block++) { struct buffer_head *gdb; ext4_debug("update backup group %#04llx\n", block); err = ext4_resize_ensure_credits_batch(handle, 1); if (err < 0) goto out; gdb = sb_getblk(sb, block); if (unlikely(!gdb)) { err = -ENOMEM; goto out; } BUFFER_TRACE(gdb, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, gdb, EXT4_JTR_NONE); if (err) { brelse(gdb); goto out; } memcpy(gdb->b_data, sbi_array_rcu_deref(sbi, s_group_desc, j)->b_data, gdb->b_size); set_buffer_uptodate(gdb); err = ext4_handle_dirty_metadata(handle, NULL, gdb); if (unlikely(err)) { brelse(gdb); goto out; } brelse(gdb); } /* Zero out all of the reserved backup group descriptor * table blocks */ if (ext4_bg_has_super(sb, group)) { err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb, GFP_NOFS); if (err) goto out; } handle_itb: /* Initialize group tables of the group @group */ if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED)) goto handle_bb; /* Zero out all of the inode table blocks */ block = group_data[i].inode_table; ext4_debug("clear inode table blocks %#04llx -> %#04lx\n", block, sbi->s_itb_per_group); err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS); if (err) goto out; handle_bb: if (bg_flags[i] & EXT4_BG_BLOCK_UNINIT) goto handle_ib; /* Initialize block bitmap of the @group */ block = group_data[i].block_bitmap; err = ext4_resize_ensure_credits_batch(handle, 1); if (err < 0) goto out; bh = bclean(handle, sb, block); if (IS_ERR(bh)) { err = PTR_ERR(bh); goto out; } overhead = ext4_group_overhead_blocks(sb, group); if (overhead != 0) { ext4_debug("mark backup superblock %#04llx (+0)\n", start); mb_set_bits(bh->b_data, 0, EXT4_NUM_B2C(sbi, overhead)); } ext4_mark_bitmap_end(EXT4_B2C(sbi, group_data[i].blocks_count), sb->s_blocksize * 8, bh->b_data); err = ext4_handle_dirty_metadata(handle, NULL, bh); brelse(bh); if (err) goto out; handle_ib: if (bg_flags[i] & EXT4_BG_INODE_UNINIT) continue; /* Initialize inode bitmap of the @group */ block = group_data[i].inode_bitmap; err = ext4_resize_ensure_credits_batch(handle, 1); if (err < 0) goto out; /* Mark unused entries in inode bitmap used */ bh = bclean(handle, sb, block); if (IS_ERR(bh)) { err = PTR_ERR(bh); goto out; } ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, bh->b_data); err = ext4_handle_dirty_metadata(handle, NULL, bh); brelse(bh); if (err) goto out; } /* Mark group tables in block bitmap */ for (j = 0; j < GROUP_TABLE_COUNT; j++) { count = group_table_count[j]; start = (&group_data[0].block_bitmap)[j]; block = start; for (i = 1; i < flex_gd->count; i++) { block += group_table_count[j]; if (block == (&group_data[i].block_bitmap)[j]) { count += group_table_count[j]; continue; } err = set_flexbg_block_bitmap(sb, handle, flex_gd, EXT4_B2C(sbi, start), EXT4_B2C(sbi, start + count - 1)); if (err) goto out; count = group_table_count[j]; start = (&group_data[i].block_bitmap)[j]; block = start; } err = set_flexbg_block_bitmap(sb, handle, flex_gd, EXT4_B2C(sbi, start), EXT4_B2C(sbi, start + count - 1)); if (err) goto out; } out: err2 = ext4_journal_stop(handle); if (err2 && !err) err = err2; return err; } /* * Iterate through the groups which hold BACKUP superblock/GDT copies in an * ext4 filesystem. The counters should be initialized to 1, 5, and 7 before * calling this for the first time. In a sparse filesystem it will be the * sequence of powers of 3, 5, and 7: 1, 3, 5, 7, 9, 25, 27, 49, 81, ... * For a non-sparse filesystem it will be every group: 1, 2, 3, 4, ... */ unsigned int ext4_list_backups(struct super_block *sb, unsigned int *three, unsigned int *five, unsigned int *seven) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; unsigned int *min = three; int mult = 3; unsigned int ret; if (ext4_has_feature_sparse_super2(sb)) { do { if (*min > 2) return UINT_MAX; ret = le32_to_cpu(es->s_backup_bgs[*min - 1]); *min += 1; } while (!ret); return ret; } if (!ext4_has_feature_sparse_super(sb)) { ret = *min; *min += 1; return ret; } if (*five < *min) { min = five; mult = 5; } if (*seven < *min) { min = seven; mult = 7; } ret = *min; *min *= mult; return ret; } /* * Check that all of the backup GDT blocks are held in the primary GDT block. * It is assumed that they are stored in group order. Returns the number of * groups in current filesystem that have BACKUPS, or -ve error code. */ static int verify_reserved_gdb(struct super_block *sb, ext4_group_t end, struct buffer_head *primary) { const ext4_fsblk_t blk = primary->b_blocknr; unsigned three = 1; unsigned five = 5; unsigned seven = 7; unsigned grp; __le32 *p = (__le32 *)primary->b_data; int gdbackups = 0; while ((grp = ext4_list_backups(sb, &three, &five, &seven)) < end) { if (le32_to_cpu(*p++) != grp * EXT4_BLOCKS_PER_GROUP(sb) + blk){ ext4_warning(sb, "reserved GDT %llu" " missing grp %d (%llu)", blk, grp, grp * (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) + blk); return -EINVAL; } if (++gdbackups > EXT4_ADDR_PER_BLOCK(sb)) return -EFBIG; } return gdbackups; } /* * Called when we need to bring a reserved group descriptor table block into * use from the resize inode. The primary copy of the new GDT block currently * is an indirect block (under the double indirect block in the resize inode). * The new backup GDT blocks will be stored as leaf blocks in this indirect * block, in group order. Even though we know all the block numbers we need, * we check to ensure that the resize inode has actually reserved these blocks. * * Don't need to update the block bitmaps because the blocks are still in use. * * We get all of the error cases out of the way, so that we are sure to not * fail once we start modifying the data on disk, because JBD has no rollback. */ static int add_new_gdb(handle_t *handle, struct inode *inode, ext4_group_t group) { struct super_block *sb = inode->i_sb; struct ext4_super_block *es = EXT4_SB(sb)->s_es; unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb); ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num; struct buffer_head **o_group_desc, **n_group_desc = NULL; struct buffer_head *dind = NULL; struct buffer_head *gdb_bh = NULL; int gdbackups; struct ext4_iloc iloc = { .bh = NULL }; __le32 *data; int err; if (test_opt(sb, DEBUG)) printk(KERN_DEBUG "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n", gdb_num); gdb_bh = ext4_sb_bread(sb, gdblock, 0); if (IS_ERR(gdb_bh)) return PTR_ERR(gdb_bh); gdbackups = verify_reserved_gdb(sb, group, gdb_bh); if (gdbackups < 0) { err = gdbackups; goto errout; } data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK; dind = ext4_sb_bread(sb, le32_to_cpu(*data), 0); if (IS_ERR(dind)) { err = PTR_ERR(dind); dind = NULL; goto errout; } data = (__le32 *)dind->b_data; if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) { ext4_warning(sb, "new group %u GDT block %llu not reserved", group, gdblock); err = -EINVAL; goto errout; } BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, EXT4_SB(sb)->s_sbh, EXT4_JTR_NONE); if (unlikely(err)) goto errout; BUFFER_TRACE(gdb_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, gdb_bh, EXT4_JTR_NONE); if (unlikely(err)) goto errout; BUFFER_TRACE(dind, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, dind, EXT4_JTR_NONE); if (unlikely(err)) { ext4_std_error(sb, err); goto errout; } /* ext4_reserve_inode_write() gets a reference on the iloc */ err = ext4_reserve_inode_write(handle, inode, &iloc); if (unlikely(err)) goto errout; n_group_desc = kvmalloc((gdb_num + 1) * sizeof(struct buffer_head *), GFP_KERNEL); if (!n_group_desc) { err = -ENOMEM; ext4_warning(sb, "not enough memory for %lu groups", gdb_num + 1); goto errout; } /* * Finally, we have all of the possible failures behind us... * * Remove new GDT block from inode double-indirect block and clear out * the new GDT block for use (which also "frees" the backup GDT blocks * from the reserved inode). We don't need to change the bitmaps for * these blocks, because they are marked as in-use from being in the * reserved inode, and will become GDT blocks (primary and backup). */ data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0; err = ext4_handle_dirty_metadata(handle, NULL, dind); if (unlikely(err)) { ext4_std_error(sb, err); goto errout; } inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> (9 - EXT4_SB(sb)->s_cluster_bits); ext4_mark_iloc_dirty(handle, inode, &iloc); memset(gdb_bh->b_data, 0, sb->s_blocksize); err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh); if (unlikely(err)) { ext4_std_error(sb, err); iloc.bh = NULL; goto errout; } brelse(dind); rcu_read_lock(); o_group_desc = rcu_dereference(EXT4_SB(sb)->s_group_desc); memcpy(n_group_desc, o_group_desc, EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); rcu_read_unlock(); n_group_desc[gdb_num] = gdb_bh; rcu_assign_pointer(EXT4_SB(sb)->s_group_desc, n_group_desc); EXT4_SB(sb)->s_gdb_count++; ext4_kvfree_array_rcu(o_group_desc); lock_buffer(EXT4_SB(sb)->s_sbh); le16_add_cpu(&es->s_reserved_gdt_blocks, -1); ext4_superblock_csum_set(sb); unlock_buffer(EXT4_SB(sb)->s_sbh); err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); if (err) ext4_std_error(sb, err); return err; errout: kvfree(n_group_desc); brelse(iloc.bh); brelse(dind); brelse(gdb_bh); ext4_debug("leaving with error %d\n", err); return err; } /* * If there is no available space in the existing block group descriptors for * the new block group and there are no reserved block group descriptors, then * the meta_bg feature will get enabled, and es->s_first_meta_bg will get set * to the first block group that is managed using meta_bg and s_first_meta_bg * must be a multiple of EXT4_DESC_PER_BLOCK(sb). * This function will be called when first group of meta_bg is added to bring * new group descriptors block of new added meta_bg. */ static int add_new_gdb_meta_bg(struct super_block *sb, handle_t *handle, ext4_group_t group) { ext4_fsblk_t gdblock; struct buffer_head *gdb_bh; struct buffer_head **o_group_desc, **n_group_desc; unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb); int err; gdblock = ext4_group_first_block_no(sb, group) + ext4_bg_has_super(sb, group); gdb_bh = ext4_sb_bread(sb, gdblock, 0); if (IS_ERR(gdb_bh)) return PTR_ERR(gdb_bh); n_group_desc = kvmalloc((gdb_num + 1) * sizeof(struct buffer_head *), GFP_KERNEL); if (!n_group_desc) { brelse(gdb_bh); err = -ENOMEM; ext4_warning(sb, "not enough memory for %lu groups", gdb_num + 1); return err; } rcu_read_lock(); o_group_desc = rcu_dereference(EXT4_SB(sb)->s_group_desc); memcpy(n_group_desc, o_group_desc, EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); rcu_read_unlock(); n_group_desc[gdb_num] = gdb_bh; BUFFER_TRACE(gdb_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, gdb_bh, EXT4_JTR_NONE); if (err) { kvfree(n_group_desc); brelse(gdb_bh); return err; } rcu_assign_pointer(EXT4_SB(sb)->s_group_desc, n_group_desc); EXT4_SB(sb)->s_gdb_count++; ext4_kvfree_array_rcu(o_group_desc); return err; } /* * Called when we are adding a new group which has a backup copy of each of * the GDT blocks (i.e. sparse group) and there are reserved GDT blocks. * We need to add these reserved backup GDT blocks to the resize inode, so * that they are kept for future resizing and not allocated to files. * * Each reserved backup GDT block will go into a different indirect block. * The indirect blocks are actually the primary reserved GDT blocks, * so we know in advance what their block numbers are. We only get the * double-indirect block to verify it is pointing to the primary reserved * GDT blocks so we don't overwrite a data block by accident. The reserved * backup GDT blocks are stored in their reserved primary GDT block. */ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, ext4_group_t group) { struct super_block *sb = inode->i_sb; int reserved_gdb =le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks); int cluster_bits = EXT4_SB(sb)->s_cluster_bits; struct buffer_head **primary; struct buffer_head *dind; struct ext4_iloc iloc; ext4_fsblk_t blk; __le32 *data, *end; int gdbackups = 0; int res, i; int err; primary = kmalloc_array(reserved_gdb, sizeof(*primary), GFP_NOFS); if (!primary) return -ENOMEM; data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK; dind = ext4_sb_bread(sb, le32_to_cpu(*data), 0); if (IS_ERR(dind)) { err = PTR_ERR(dind); dind = NULL; goto exit_free; } blk = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + EXT4_SB(sb)->s_gdb_count; data = (__le32 *)dind->b_data + (EXT4_SB(sb)->s_gdb_count % EXT4_ADDR_PER_BLOCK(sb)); end = (__le32 *)dind->b_data + EXT4_ADDR_PER_BLOCK(sb); /* Get each reserved primary GDT block and verify it holds backups */ for (res = 0; res < reserved_gdb; res++, blk++) { if (le32_to_cpu(*data) != blk) { ext4_warning(sb, "reserved block %llu" " not at offset %ld", blk, (long)(data - (__le32 *)dind->b_data)); err = -EINVAL; goto exit_bh; } primary[res] = ext4_sb_bread(sb, blk, 0); if (IS_ERR(primary[res])) { err = PTR_ERR(primary[res]); primary[res] = NULL; goto exit_bh; } gdbackups = verify_reserved_gdb(sb, group, primary[res]); if (gdbackups < 0) { brelse(primary[res]); err = gdbackups; goto exit_bh; } if (++data >= end) data = (__le32 *)dind->b_data; } for (i = 0; i < reserved_gdb; i++) { BUFFER_TRACE(primary[i], "get_write_access"); if ((err = ext4_journal_get_write_access(handle, sb, primary[i], EXT4_JTR_NONE))) goto exit_bh; } if ((err = ext4_reserve_inode_write(handle, inode, &iloc))) goto exit_bh; /* * Finally we can add each of the reserved backup GDT blocks from * the new group to its reserved primary GDT block. */ blk = group * EXT4_BLOCKS_PER_GROUP(sb); for (i = 0; i < reserved_gdb; i++) { int err2; data = (__le32 *)primary[i]->b_data; data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr); err2 = ext4_handle_dirty_metadata(handle, NULL, primary[i]); if (!err) err = err2; } inode->i_blocks += reserved_gdb * sb->s_blocksize >> (9 - cluster_bits); ext4_mark_iloc_dirty(handle, inode, &iloc); exit_bh: while (--res >= 0) brelse(primary[res]); brelse(dind); exit_free: kfree(primary); return err; } static inline void ext4_set_block_group_nr(struct super_block *sb, char *data, ext4_group_t group) { struct ext4_super_block *es = (struct ext4_super_block *) data; es->s_block_group_nr = cpu_to_le16(group); if (ext4_has_metadata_csum(sb)) es->s_checksum = ext4_superblock_csum(sb, es); } /* * Update the backup copies of the ext4 metadata. These don't need to be part * of the main resize transaction, because e2fsck will re-write them if there * is a problem (basically only OOM will cause a problem). However, we * _should_ update the backups if possible, in case the primary gets trashed * for some reason and we need to run e2fsck from a backup superblock. The * important part is that the new block and inode counts are in the backup * superblocks, and the location of the new group metadata in the GDT backups. * * We do not need take the s_resize_lock for this, because these * blocks are not otherwise touched by the filesystem code when it is * mounted. We don't need to worry about last changing from * sbi->s_groups_count, because the worst that can happen is that we * do not copy the full number of backups at this time. The resize * which changed s_groups_count will backup again. */ static void update_backups(struct super_block *sb, sector_t blk_off, char *data, int size, int meta_bg) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t last; const int bpg = EXT4_BLOCKS_PER_GROUP(sb); unsigned three = 1; unsigned five = 5; unsigned seven = 7; ext4_group_t group = 0; int rest = sb->s_blocksize - size; handle_t *handle; int err = 0, err2; handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, EXT4_MAX_TRANS_DATA); if (IS_ERR(handle)) { group = 1; err = PTR_ERR(handle); goto exit_err; } if (meta_bg == 0) { group = ext4_list_backups(sb, &three, &five, &seven); last = sbi->s_groups_count; } else { group = ext4_get_group_number(sb, blk_off) + 1; last = (ext4_group_t)(group + EXT4_DESC_PER_BLOCK(sb) - 2); } while (group < sbi->s_groups_count) { struct buffer_head *bh; ext4_fsblk_t backup_block; int has_super = ext4_bg_has_super(sb, group); ext4_fsblk_t first_block = ext4_group_first_block_no(sb, group); /* Out of journal space, and can't get more - abort - so sad */ err = ext4_resize_ensure_credits_batch(handle, 1); if (err < 0) break; if (meta_bg == 0) backup_block = ((ext4_fsblk_t)group) * bpg + blk_off; else backup_block = first_block + has_super; bh = sb_getblk(sb, backup_block); if (unlikely(!bh)) { err = -ENOMEM; break; } ext4_debug("update metadata backup %llu(+%llu)\n", backup_block, backup_block - ext4_group_first_block_no(sb, group)); BUFFER_TRACE(bh, "get_write_access"); if ((err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE))) { brelse(bh); break; } lock_buffer(bh); memcpy(bh->b_data, data, size); if (rest) memset(bh->b_data + size, 0, rest); if (has_super && (backup_block == first_block)) ext4_set_block_group_nr(sb, bh->b_data, group); set_buffer_uptodate(bh); unlock_buffer(bh); err = ext4_handle_dirty_metadata(handle, NULL, bh); if (unlikely(err)) ext4_std_error(sb, err); brelse(bh); if (meta_bg == 0) group = ext4_list_backups(sb, &three, &five, &seven); else if (group == last) break; else group = last; } if ((err2 = ext4_journal_stop(handle)) && !err) err = err2; /* * Ugh! Need to have e2fsck write the backup copies. It is too * late to revert the resize, we shouldn't fail just because of * the backup copies (they are only needed in case of corruption). * * However, if we got here we have a journal problem too, so we * can't really start a transaction to mark the superblock. * Chicken out and just set the flag on the hope it will be written * to disk, and if not - we will simply wait until next fsck. */ exit_err: if (err) { ext4_warning(sb, "can't update backup for group %u (err %d), " "forcing fsck on next reboot", group, err); sbi->s_mount_state &= ~EXT4_VALID_FS; sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS); mark_buffer_dirty(sbi->s_sbh); } } /* * ext4_add_new_descs() adds @count group descriptor of groups * starting at @group * * @handle: journal handle * @sb: super block * @group: the group no. of the first group desc to be added * @resize_inode: the resize inode * @count: number of group descriptors to be added */ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb, ext4_group_t group, struct inode *resize_inode, ext4_group_t count) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; struct buffer_head *gdb_bh; int i, gdb_off, gdb_num, err = 0; int meta_bg; meta_bg = ext4_has_feature_meta_bg(sb); for (i = 0; i < count; i++, group++) { int reserved_gdb = ext4_bg_has_super(sb, group) ? le16_to_cpu(es->s_reserved_gdt_blocks) : 0; gdb_off = group % EXT4_DESC_PER_BLOCK(sb); gdb_num = group / EXT4_DESC_PER_BLOCK(sb); /* * We will only either add reserved group blocks to a backup group * or remove reserved blocks for the first group in a new group block. * Doing both would be mean more complex code, and sane people don't * use non-sparse filesystems anymore. This is already checked above. */ if (gdb_off) { gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc, gdb_num); BUFFER_TRACE(gdb_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, gdb_bh, EXT4_JTR_NONE); if (!err && reserved_gdb && ext4_bg_num_gdb(sb, group)) err = reserve_backup_gdb(handle, resize_inode, group); } else if (meta_bg != 0) { err = add_new_gdb_meta_bg(sb, handle, group); } else { err = add_new_gdb(handle, resize_inode, group); } if (err) break; } return err; } static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block) { struct buffer_head *bh = sb_getblk(sb, block); if (unlikely(!bh)) return NULL; if (!bh_uptodate_or_lock(bh)) { if (ext4_read_bh(bh, 0, NULL) < 0) { brelse(bh); return NULL; } } return bh; } static int ext4_set_bitmap_checksums(struct super_block *sb, struct ext4_group_desc *gdp, struct ext4_new_group_data *group_data) { struct buffer_head *bh; if (!ext4_has_metadata_csum(sb)) return 0; bh = ext4_get_bitmap(sb, group_data->inode_bitmap); if (!bh) return -EIO; ext4_inode_bitmap_csum_set(sb, gdp, bh, EXT4_INODES_PER_GROUP(sb) / 8); brelse(bh); bh = ext4_get_bitmap(sb, group_data->block_bitmap); if (!bh) return -EIO; ext4_block_bitmap_csum_set(sb, gdp, bh); brelse(bh); return 0; } /* * ext4_setup_new_descs() will set up the group descriptor descriptors of a flex bg */ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb, struct ext4_new_flex_group_data *flex_gd) { struct ext4_new_group_data *group_data = flex_gd->groups; struct ext4_group_desc *gdp; struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *gdb_bh; ext4_group_t group; __u16 *bg_flags = flex_gd->bg_flags; int i, gdb_off, gdb_num, err = 0; for (i = 0; i < flex_gd->count; i++, group_data++, bg_flags++) { group = group_data->group; gdb_off = group % EXT4_DESC_PER_BLOCK(sb); gdb_num = group / EXT4_DESC_PER_BLOCK(sb); /* * get_write_access() has been called on gdb_bh by ext4_add_new_desc(). */ gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc, gdb_num); /* Update group descriptor block for new group */ gdp = (struct ext4_group_desc *)(gdb_bh->b_data + gdb_off * EXT4_DESC_SIZE(sb)); memset(gdp, 0, EXT4_DESC_SIZE(sb)); ext4_block_bitmap_set(sb, gdp, group_data->block_bitmap); ext4_inode_bitmap_set(sb, gdp, group_data->inode_bitmap); err = ext4_set_bitmap_checksums(sb, gdp, group_data); if (err) { ext4_std_error(sb, err); break; } ext4_inode_table_set(sb, gdp, group_data->inode_table); ext4_free_group_clusters_set(sb, gdp, group_data->free_clusters_count); ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); if (ext4_has_group_desc_csum(sb)) ext4_itable_unused_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); gdp->bg_flags = cpu_to_le16(*bg_flags); ext4_group_desc_csum_set(sb, group, gdp); err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh); if (unlikely(err)) { ext4_std_error(sb, err); break; } /* * We can allocate memory for mb_alloc based on the new group * descriptor */ err = ext4_mb_add_groupinfo(sb, group, gdp); if (err) break; } return err; } static void ext4_add_overhead(struct super_block *sb, const ext4_fsblk_t overhead) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; sbi->s_overhead += overhead; es->s_overhead_clusters = cpu_to_le32(sbi->s_overhead); smp_wmb(); } /* * ext4_update_super() updates the super block so that the newly added * groups can be seen by the filesystem. * * @sb: super block * @flex_gd: new added groups */ static void ext4_update_super(struct super_block *sb, struct ext4_new_flex_group_data *flex_gd) { ext4_fsblk_t blocks_count = 0; ext4_fsblk_t free_blocks = 0; ext4_fsblk_t reserved_blocks = 0; struct ext4_new_group_data *group_data = flex_gd->groups; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int i; BUG_ON(flex_gd->count == 0 || group_data == NULL); /* * Make the new blocks and inodes valid next. We do this before * increasing the group count so that once the group is enabled, * all of its blocks and inodes are already valid. * * We always allocate group-by-group, then block-by-block or * inode-by-inode within a group, so enabling these * blocks/inodes before the group is live won't actually let us * allocate the new space yet. */ for (i = 0; i < flex_gd->count; i++) { blocks_count += group_data[i].blocks_count; free_blocks += EXT4_C2B(sbi, group_data[i].free_clusters_count); } reserved_blocks = ext4_r_blocks_count(es) * 100; reserved_blocks = div64_u64(reserved_blocks, ext4_blocks_count(es)); reserved_blocks *= blocks_count; do_div(reserved_blocks, 100); lock_buffer(sbi->s_sbh); ext4_blocks_count_set(es, ext4_blocks_count(es) + blocks_count); ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + free_blocks); le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb) * flex_gd->count); le32_add_cpu(&es->s_free_inodes_count, EXT4_INODES_PER_GROUP(sb) * flex_gd->count); ext4_debug("free blocks count %llu", ext4_free_blocks_count(es)); /* * We need to protect s_groups_count against other CPUs seeing * inconsistent state in the superblock. * * The precise rules we use are: * * * Writers must perform a smp_wmb() after updating all * dependent data and before modifying the groups count * * * Readers must perform an smp_rmb() after reading the groups * count and before reading any dependent data. * * NB. These rules can be relaxed when checking the group count * while freeing data, as we can only allocate from a block * group after serialising against the group count, and we can * only then free after serialising in turn against that * allocation. */ smp_wmb(); /* Update the global fs size fields */ sbi->s_groups_count += flex_gd->count; sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); /* Update the reserved block counts only once the new group is * active. */ ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) + reserved_blocks); /* Update the free space counts */ percpu_counter_add(&sbi->s_freeclusters_counter, EXT4_NUM_B2C(sbi, free_blocks)); percpu_counter_add(&sbi->s_freeinodes_counter, EXT4_INODES_PER_GROUP(sb) * flex_gd->count); ext4_debug("free blocks count %llu", percpu_counter_read(&sbi->s_freeclusters_counter)); if (ext4_has_feature_flex_bg(sb) && sbi->s_log_groups_per_flex) { ext4_group_t flex_group; struct flex_groups *fg; flex_group = ext4_flex_group(sbi, group_data[0].group); fg = sbi_array_rcu_deref(sbi, s_flex_groups, flex_group); atomic64_add(EXT4_NUM_B2C(sbi, free_blocks), &fg->free_clusters); atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count, &fg->free_inodes); } /* * Update the fs overhead information. * * For bigalloc, if the superblock already has a properly calculated * overhead, update it with a value based on numbers already computed * above for the newly allocated capacity. */ if (ext4_has_feature_bigalloc(sb) && (sbi->s_overhead != 0)) ext4_add_overhead(sb, EXT4_NUM_B2C(sbi, blocks_count - free_blocks)); else ext4_calculate_overhead(sb); es->s_overhead_clusters = cpu_to_le32(sbi->s_overhead); ext4_superblock_csum_set(sb); unlock_buffer(sbi->s_sbh); if (test_opt(sb, DEBUG)) printk(KERN_DEBUG "EXT4-fs: added group %u:" "%llu blocks(%llu free %llu reserved)\n", flex_gd->count, blocks_count, free_blocks, reserved_blocks); } /* Add a flex group to an fs. Ensure we handle all possible error conditions * _before_ we start modifying the filesystem, because we cannot abort the * transaction and not have it write the data to disk. */ static int ext4_flex_group_add(struct super_block *sb, struct inode *resize_inode, struct ext4_new_flex_group_data *flex_gd) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; ext4_fsblk_t o_blocks_count; ext4_grpblk_t last; ext4_group_t group; handle_t *handle; unsigned reserved_gdb; int err = 0, err2 = 0, credit; BUG_ON(!flex_gd->count || !flex_gd->groups || !flex_gd->bg_flags); reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks); o_blocks_count = ext4_blocks_count(es); ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); BUG_ON(last); err = setup_new_flex_group_blocks(sb, flex_gd); if (err) goto exit; /* * We will always be modifying at least the superblock and GDT * blocks. If we are adding a group past the last current GDT block, * we will also modify the inode and the dindirect block. If we * are adding a group with superblock/GDT backups we will also * modify each of the reserved GDT dindirect blocks. */ credit = 3; /* sb, resize inode, resize inode dindirect */ /* GDT blocks */ credit += 1 + DIV_ROUND_UP(flex_gd->count, EXT4_DESC_PER_BLOCK(sb)); credit += reserved_gdb; /* Reserved GDT dindirect blocks */ handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, credit); if (IS_ERR(handle)) { err = PTR_ERR(handle); goto exit; } BUFFER_TRACE(sbi->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, EXT4_JTR_NONE); if (err) goto exit_journal; group = flex_gd->groups[0].group; BUG_ON(group != sbi->s_groups_count); err = ext4_add_new_descs(handle, sb, group, resize_inode, flex_gd->count); if (err) goto exit_journal; err = ext4_setup_new_descs(handle, sb, flex_gd); if (err) goto exit_journal; ext4_update_super(sb, flex_gd); err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); exit_journal: err2 = ext4_journal_stop(handle); if (!err) err = err2; if (!err) { int gdb_num = group / EXT4_DESC_PER_BLOCK(sb); int gdb_num_end = ((group + flex_gd->count - 1) / EXT4_DESC_PER_BLOCK(sb)); int meta_bg = ext4_has_feature_meta_bg(sb) && gdb_num >= le32_to_cpu(es->s_first_meta_bg); sector_t padding_blocks = meta_bg ? 0 : sbi->s_sbh->b_blocknr - ext4_group_first_block_no(sb, 0); update_backups(sb, ext4_group_first_block_no(sb, 0), (char *)es, sizeof(struct ext4_super_block), 0); for (; gdb_num <= gdb_num_end; gdb_num++) { struct buffer_head *gdb_bh; gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc, gdb_num); update_backups(sb, gdb_bh->b_blocknr - padding_blocks, gdb_bh->b_data, gdb_bh->b_size, meta_bg); } } exit: return err; } static int ext4_setup_next_flex_gd(struct super_block *sb, struct ext4_new_flex_group_data *flex_gd, ext4_fsblk_t n_blocks_count) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; struct ext4_new_group_data *group_data = flex_gd->groups; ext4_fsblk_t o_blocks_count; ext4_group_t n_group; ext4_group_t group; ext4_group_t last_group; ext4_grpblk_t last; ext4_grpblk_t clusters_per_group; unsigned long i; clusters_per_group = EXT4_CLUSTERS_PER_GROUP(sb); o_blocks_count = ext4_blocks_count(es); if (o_blocks_count == n_blocks_count) return 0; ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); BUG_ON(last); ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &last); last_group = group | (flex_gd->resize_bg - 1); if (last_group > n_group) last_group = n_group; flex_gd->count = last_group - group + 1; for (i = 0; i < flex_gd->count; i++) { int overhead; group_data[i].group = group + i; group_data[i].blocks_count = EXT4_BLOCKS_PER_GROUP(sb); overhead = ext4_group_overhead_blocks(sb, group + i); group_data[i].mdata_blocks = overhead; group_data[i].free_clusters_count = EXT4_CLUSTERS_PER_GROUP(sb); if (ext4_has_group_desc_csum(sb)) { flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT | EXT4_BG_INODE_UNINIT; if (!test_opt(sb, INIT_INODE_TABLE)) flex_gd->bg_flags[i] |= EXT4_BG_INODE_ZEROED; } else flex_gd->bg_flags[i] = EXT4_BG_INODE_ZEROED; } if (last_group == n_group && ext4_has_group_desc_csum(sb)) /* We need to initialize block bitmap of last group. */ flex_gd->bg_flags[i - 1] &= ~EXT4_BG_BLOCK_UNINIT; if ((last_group == n_group) && (last != clusters_per_group - 1)) { group_data[i - 1].blocks_count = EXT4_C2B(sbi, last + 1); group_data[i - 1].free_clusters_count -= clusters_per_group - last - 1; } return 1; } /* Add group descriptor data to an existing or new group descriptor block. * Ensure we handle all possible error conditions _before_ we start modifying * the filesystem, because we cannot abort the transaction and not have it * write the data to disk. * * If we are on a GDT block boundary, we need to get the reserved GDT block. * Otherwise, we may need to add backup GDT blocks for a sparse group. * * We only need to hold the superblock lock while we are actually adding * in the new group's counts to the superblock. Prior to that we have * not really "added" the group at all. We re-check that we are still * adding in the last group in case things have changed since verifying. */ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) { struct ext4_new_flex_group_data flex_gd; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int reserved_gdb = ext4_bg_has_super(sb, input->group) ? le16_to_cpu(es->s_reserved_gdt_blocks) : 0; struct inode *inode = NULL; int gdb_off; int err; __u16 bg_flags = 0; gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb); if (gdb_off == 0 && !ext4_has_feature_sparse_super(sb)) { ext4_warning(sb, "Can't resize non-sparse filesystem further"); return -EPERM; } if (ext4_blocks_count(es) + input->blocks_count < ext4_blocks_count(es)) { ext4_warning(sb, "blocks_count overflow"); return -EINVAL; } if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) < le32_to_cpu(es->s_inodes_count)) { ext4_warning(sb, "inodes_count overflow"); return -EINVAL; } if (reserved_gdb || gdb_off == 0) { if (!ext4_has_feature_resize_inode(sb) || !le16_to_cpu(es->s_reserved_gdt_blocks)) { ext4_warning(sb, "No reserved GDT blocks, can't resize"); return -EPERM; } inode = ext4_iget(sb, EXT4_RESIZE_INO, EXT4_IGET_SPECIAL); if (IS_ERR(inode)) { ext4_warning(sb, "Error opening resize inode"); return PTR_ERR(inode); } } err = verify_group_input(sb, input); if (err) goto out; err = ext4_alloc_flex_bg_array(sb, input->group + 1); if (err) goto out; err = ext4_mb_alloc_groupinfo(sb, input->group + 1); if (err) goto out; flex_gd.count = 1; flex_gd.groups = input; flex_gd.bg_flags = &bg_flags; err = ext4_flex_group_add(sb, inode, &flex_gd); out: iput(inode); return err; } /* ext4_group_add */ /* * extend a group without checking assuming that checking has been done. */ static int ext4_group_extend_no_check(struct super_block *sb, ext4_fsblk_t o_blocks_count, ext4_grpblk_t add) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; handle_t *handle; int err = 0, err2; /* We will update the superblock, one block bitmap, and * one group descriptor via ext4_group_add_blocks(). */ handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, 3); if (IS_ERR(handle)) { err = PTR_ERR(handle); ext4_warning(sb, "error %d on journal start", err); return err; } BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, EXT4_SB(sb)->s_sbh, EXT4_JTR_NONE); if (err) { ext4_warning(sb, "error %d on journal write access", err); goto errout; } lock_buffer(EXT4_SB(sb)->s_sbh); ext4_blocks_count_set(es, o_blocks_count + add); ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + add); ext4_superblock_csum_set(sb); unlock_buffer(EXT4_SB(sb)->s_sbh); ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, o_blocks_count + add); /* We add the blocks to the bitmap and set the group need init bit */ err = ext4_group_add_blocks(handle, sb, o_blocks_count, add); if (err) goto errout; ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, o_blocks_count + add); errout: err2 = ext4_journal_stop(handle); if (err2 && !err) err = err2; if (!err) { if (test_opt(sb, DEBUG)) printk(KERN_DEBUG "EXT4-fs: extended group to %llu " "blocks\n", ext4_blocks_count(es)); update_backups(sb, ext4_group_first_block_no(sb, 0), (char *)es, sizeof(struct ext4_super_block), 0); } return err; } /* * Extend the filesystem to the new number of blocks specified. This entry * point is only used to extend the current filesystem to the end of the last * existing group. It can be accessed via ioctl, or by "remount,resize=<size>" * for emergencies (because it has no dependencies on reserved blocks). * * If we _really_ wanted, we could use default values to call ext4_group_add() * allow the "remount" trick to work for arbitrary resizing, assuming enough * GDT blocks are reserved to grow to the desired size. */ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_fsblk_t n_blocks_count) { ext4_fsblk_t o_blocks_count; ext4_grpblk_t last; ext4_grpblk_t add; struct buffer_head *bh; ext4_group_t group; o_blocks_count = ext4_blocks_count(es); if (test_opt(sb, DEBUG)) ext4_msg(sb, KERN_DEBUG, "extending last group from %llu to %llu blocks", o_blocks_count, n_blocks_count); if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) return 0; if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { ext4_msg(sb, KERN_ERR, "filesystem too large to resize to %llu blocks safely", n_blocks_count); return -EINVAL; } if (n_blocks_count < o_blocks_count) { ext4_warning(sb, "can't shrink FS - resize aborted"); return -EINVAL; } /* Handle the remaining blocks in the last group only. */ ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); if (last == 0) { ext4_warning(sb, "need to use ext2online to resize further"); return -EPERM; } add = EXT4_BLOCKS_PER_GROUP(sb) - last; if (o_blocks_count + add < o_blocks_count) { ext4_warning(sb, "blocks_count overflow"); return -EINVAL; } if (o_blocks_count + add > n_blocks_count) add = n_blocks_count - o_blocks_count; if (o_blocks_count + add < n_blocks_count) ext4_warning(sb, "will only finish group (%llu blocks, %u new)", o_blocks_count + add, add); /* See if the device is actually as big as what was requested */ bh = ext4_sb_bread(sb, o_blocks_count + add - 1, 0); if (IS_ERR(bh)) { ext4_warning(sb, "can't read last block, resize aborted"); return -ENOSPC; } brelse(bh); return ext4_group_extend_no_check(sb, o_blocks_count, add); } /* ext4_group_extend */ static int num_desc_blocks(struct super_block *sb, ext4_group_t groups) { return (groups + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb); } /* * Release the resize inode and drop the resize_inode feature if there * are no more reserved gdt blocks, and then convert the file system * to enable meta_bg */ static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode) { handle_t *handle; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; struct ext4_inode_info *ei = EXT4_I(inode); ext4_fsblk_t nr; int i, ret, err = 0; int credits = 1; ext4_msg(sb, KERN_INFO, "Converting file system to meta_bg"); if (inode) { if (es->s_reserved_gdt_blocks) { ext4_error(sb, "Unexpected non-zero " "s_reserved_gdt_blocks"); return -EPERM; } /* Do a quick sanity check of the resize inode */ if (inode->i_blocks != 1 << (inode->i_blkbits - (9 - sbi->s_cluster_bits))) goto invalid_resize_inode; for (i = 0; i < EXT4_N_BLOCKS; i++) { if (i == EXT4_DIND_BLOCK) { if (ei->i_data[i]) continue; else goto invalid_resize_inode; } if (ei->i_data[i]) goto invalid_resize_inode; } credits += 3; /* block bitmap, bg descriptor, resize inode */ } handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, credits); if (IS_ERR(handle)) return PTR_ERR(handle); BUFFER_TRACE(sbi->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, EXT4_JTR_NONE); if (err) goto errout; lock_buffer(sbi->s_sbh); ext4_clear_feature_resize_inode(sb); ext4_set_feature_meta_bg(sb); sbi->s_es->s_first_meta_bg = cpu_to_le32(num_desc_blocks(sb, sbi->s_groups_count)); ext4_superblock_csum_set(sb); unlock_buffer(sbi->s_sbh); err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); if (err) { ext4_std_error(sb, err); goto errout; } if (inode) { nr = le32_to_cpu(ei->i_data[EXT4_DIND_BLOCK]); ext4_free_blocks(handle, inode, NULL, nr, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); ei->i_data[EXT4_DIND_BLOCK] = 0; inode->i_blocks = 0; err = ext4_mark_inode_dirty(handle, inode); if (err) ext4_std_error(sb, err); } errout: ret = ext4_journal_stop(handle); return err ? err : ret; invalid_resize_inode: ext4_error(sb, "corrupted/inconsistent resize inode"); return -EINVAL; } /* * ext4_resize_fs() resizes a fs to new size specified by @n_blocks_count * * @sb: super block of the fs to be resized * @n_blocks_count: the number of blocks resides in the resized fs */ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) { struct ext4_new_flex_group_data *flex_gd = NULL; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; struct buffer_head *bh; struct inode *resize_inode = NULL; ext4_grpblk_t add, offset; unsigned long n_desc_blocks; unsigned long o_desc_blocks; ext4_group_t o_group; ext4_group_t n_group; ext4_fsblk_t o_blocks_count; ext4_fsblk_t n_blocks_count_retry = 0; unsigned long last_update_time = 0; int err = 0; int meta_bg; unsigned int flexbg_size = ext4_flex_bg_size(sbi); /* See if the device is actually as big as what was requested */ bh = ext4_sb_bread(sb, n_blocks_count - 1, 0); if (IS_ERR(bh)) { ext4_warning(sb, "can't read last block, resize aborted"); return -ENOSPC; } brelse(bh); /* * For bigalloc, trim the requested size to the nearest cluster * boundary to avoid creating an unusable filesystem. We do this * silently, instead of returning an error, to avoid breaking * callers that blindly resize the filesystem to the full size of * the underlying block device. */ if (ext4_has_feature_bigalloc(sb)) n_blocks_count &= ~((1 << EXT4_CLUSTER_BITS(sb)) - 1); retry: o_blocks_count = ext4_blocks_count(es); ext4_msg(sb, KERN_INFO, "resizing filesystem from %llu " "to %llu blocks", o_blocks_count, n_blocks_count); if (n_blocks_count < o_blocks_count) { /* On-line shrinking not supported */ ext4_warning(sb, "can't shrink FS - resize aborted"); return -EINVAL; } if (n_blocks_count == o_blocks_count) /* Nothing need to do */ return 0; n_group = ext4_get_group_number(sb, n_blocks_count - 1); if (n_group >= (0xFFFFFFFFUL / EXT4_INODES_PER_GROUP(sb))) { ext4_warning(sb, "resize would cause inodes_count overflow"); return -EINVAL; } ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset); n_desc_blocks = num_desc_blocks(sb, n_group + 1); o_desc_blocks = num_desc_blocks(sb, sbi->s_groups_count); meta_bg = ext4_has_feature_meta_bg(sb); if (ext4_has_feature_resize_inode(sb)) { if (meta_bg) { ext4_error(sb, "resize_inode and meta_bg enabled " "simultaneously"); return -EINVAL; } if (n_desc_blocks > o_desc_blocks + le16_to_cpu(es->s_reserved_gdt_blocks)) { n_blocks_count_retry = n_blocks_count; n_desc_blocks = o_desc_blocks + le16_to_cpu(es->s_reserved_gdt_blocks); n_group = n_desc_blocks * EXT4_DESC_PER_BLOCK(sb); n_blocks_count = (ext4_fsblk_t)n_group * EXT4_BLOCKS_PER_GROUP(sb) + le32_to_cpu(es->s_first_data_block); n_group--; /* set to last group number */ } if (!resize_inode) resize_inode = ext4_iget(sb, EXT4_RESIZE_INO, EXT4_IGET_SPECIAL); if (IS_ERR(resize_inode)) { ext4_warning(sb, "Error opening resize inode"); return PTR_ERR(resize_inode); } } if ((!resize_inode && !meta_bg && n_desc_blocks > o_desc_blocks) || n_blocks_count == o_blocks_count) { err = ext4_convert_meta_bg(sb, resize_inode); if (err) goto out; if (resize_inode) { iput(resize_inode); resize_inode = NULL; } if (n_blocks_count_retry) { n_blocks_count = n_blocks_count_retry; n_blocks_count_retry = 0; goto retry; } } /* * Make sure the last group has enough space so that it's * guaranteed to have enough space for all metadata blocks * that it might need to hold. (We might not need to store * the inode table blocks in the last block group, but there * will be cases where this might be needed.) */ if ((ext4_group_first_block_no(sb, n_group) + ext4_group_overhead_blocks(sb, n_group) + 2 + sbi->s_itb_per_group + sbi->s_cluster_ratio) >= n_blocks_count) { n_blocks_count = ext4_group_first_block_no(sb, n_group); n_group--; n_blocks_count_retry = 0; if (resize_inode) { iput(resize_inode); resize_inode = NULL; } goto retry; } /* extend the last group */ if (n_group == o_group) add = n_blocks_count - o_blocks_count; else add = EXT4_C2B(sbi, EXT4_CLUSTERS_PER_GROUP(sb) - (offset + 1)); if (add > 0) { err = ext4_group_extend_no_check(sb, o_blocks_count, add); if (err) goto out; } if (ext4_blocks_count(es) == n_blocks_count && n_blocks_count_retry == 0) goto out; err = ext4_alloc_flex_bg_array(sb, n_group + 1); if (err) goto out; err = ext4_mb_alloc_groupinfo(sb, n_group + 1); if (err) goto out; flex_gd = alloc_flex_gd(flexbg_size, o_group, n_group); if (flex_gd == NULL) { err = -ENOMEM; goto out; } /* Add flex groups. Note that a regular group is a * flex group with 1 group. */ while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count)) { if (time_is_before_jiffies(last_update_time + HZ * 10)) { if (last_update_time) ext4_msg(sb, KERN_INFO, "resized to %llu blocks", ext4_blocks_count(es)); last_update_time = jiffies; } if (ext4_alloc_group_tables(sb, flex_gd, flexbg_size) != 0) break; err = ext4_flex_group_add(sb, resize_inode, flex_gd); if (unlikely(err)) break; } if (!err && n_blocks_count_retry) { n_blocks_count = n_blocks_count_retry; n_blocks_count_retry = 0; free_flex_gd(flex_gd); flex_gd = NULL; if (resize_inode) { iput(resize_inode); resize_inode = NULL; } goto retry; } out: if (flex_gd) free_flex_gd(flex_gd); if (resize_inode != NULL) iput(resize_inode); if (err) ext4_warning(sb, "error (%d) occurred during " "file system resize", err); ext4_msg(sb, KERN_INFO, "resized filesystem to %llu", ext4_blocks_count(es)); return err; }
5 3 3 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 // SPDX-License-Identifier: GPL-2.0 /* * sysctl_net_ipv4.c: sysctl interface to net IPV4 subsystem. * * Begun April 1, 1996, Mike Shaver. * Added /proc/sys/net/ipv4 directory entry (empty =) ). [MS] */ #include <linux/sysctl.h> #include <linux/seqlock.h> #include <linux/init.h> #include <linux/slab.h> #include <net/icmp.h> #include <net/ip.h> #include <net/ip_fib.h> #include <net/tcp.h> #include <net/udp.h> #include <net/cipso_ipv4.h> #include <net/ping.h> #include <net/protocol.h> #include <net/netevent.h> static int tcp_retr1_max = 255; static int ip_local_port_range_min[] = { 1, 1 }; static int ip_local_port_range_max[] = { 65535, 65535 }; static int tcp_adv_win_scale_min = -31; static int tcp_adv_win_scale_max = 31; static int tcp_app_win_max = 31; static int tcp_min_snd_mss_min = TCP_MIN_SND_MSS; static int tcp_min_snd_mss_max = 65535; static int ip_privileged_port_min; static int ip_privileged_port_max = 65535; static int ip_ttl_min = 1; static int ip_ttl_max = 255; static int tcp_syn_retries_min = 1; static int tcp_syn_retries_max = MAX_TCP_SYNCNT; static int tcp_syn_linear_timeouts_max = MAX_TCP_SYNCNT; static unsigned long ip_ping_group_range_min[] = { 0, 0 }; static unsigned long ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; static u32 u32_max_div_HZ = UINT_MAX / HZ; static int one_day_secs = 24 * 3600; static u32 fib_multipath_hash_fields_all_mask __maybe_unused = FIB_MULTIPATH_HASH_FIELD_ALL_MASK; static unsigned int tcp_child_ehash_entries_max = 16 * 1024 * 1024; static unsigned int udp_child_hash_entries_max = UDP_HTABLE_SIZE_MAX; static int tcp_plb_max_rounds = 31; static int tcp_plb_max_cong_thresh = 256; /* obsolete */ static int sysctl_tcp_low_latency __read_mostly; /* Update system visible IP port range */ static void set_local_port_range(struct net *net, unsigned int low, unsigned int high) { bool same_parity = !((low ^ high) & 1); if (same_parity && !net->ipv4.ip_local_ports.warned) { net->ipv4.ip_local_ports.warned = true; pr_err_ratelimited("ip_local_port_range: prefer different parity for start/end values.\n"); } WRITE_ONCE(net->ipv4.ip_local_ports.range, high << 16 | low); } /* Validate changes from /proc interface. */ static int ipv4_local_port_range(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = table->data; int ret; int range[2]; struct ctl_table tmp = { .data = &range, .maxlen = sizeof(range), .mode = table->mode, .extra1 = &ip_local_port_range_min, .extra2 = &ip_local_port_range_max, }; inet_get_local_port_range(net, &range[0], &range[1]); ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && ret == 0) { /* Ensure that the upper limit is not smaller than the lower, * and that the lower does not encroach upon the privileged * port limit. */ if ((range[1] < range[0]) || (range[0] < READ_ONCE(net->ipv4.sysctl_ip_prot_sock))) ret = -EINVAL; else set_local_port_range(net, range[0], range[1]); } return ret; } /* Validate changes from /proc interface. */ static int ipv4_privileged_ports(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, ipv4.sysctl_ip_prot_sock); int ret; int pports; int range[2]; struct ctl_table tmp = { .data = &pports, .maxlen = sizeof(pports), .mode = table->mode, .extra1 = &ip_privileged_port_min, .extra2 = &ip_privileged_port_max, }; pports = READ_ONCE(net->ipv4.sysctl_ip_prot_sock); ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && ret == 0) { inet_get_local_port_range(net, &range[0], &range[1]); /* Ensure that the local port range doesn't overlap with the * privileged port range. */ if (range[0] < pports) ret = -EINVAL; else WRITE_ONCE(net->ipv4.sysctl_ip_prot_sock, pports); } return ret; } static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low, kgid_t *high) { kgid_t *data = table->data; struct net *net = container_of(table->data, struct net, ipv4.ping_group_range.range); unsigned int seq; do { seq = read_seqbegin(&net->ipv4.ping_group_range.lock); *low = data[0]; *high = data[1]; } while (read_seqretry(&net->ipv4.ping_group_range.lock, seq)); } /* Update system visible IP port range */ static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t high) { kgid_t *data = table->data; struct net *net = container_of(table->data, struct net, ipv4.ping_group_range.range); write_seqlock(&net->ipv4.ping_group_range.lock); data[0] = low; data[1] = high; write_sequnlock(&net->ipv4.ping_group_range.lock); } /* Validate changes from /proc interface. */ static int ipv4_ping_group_range(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct user_namespace *user_ns = current_user_ns(); int ret; unsigned long urange[2]; kgid_t low, high; struct ctl_table tmp = { .data = &urange, .maxlen = sizeof(urange), .mode = table->mode, .extra1 = &ip_ping_group_range_min, .extra2 = &ip_ping_group_range_max, }; inet_get_ping_group_range_table(table, &low, &high); urange[0] = from_kgid_munged(user_ns, low); urange[1] = from_kgid_munged(user_ns, high); ret = proc_doulongvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && ret == 0) { low = make_kgid(user_ns, urange[0]); high = make_kgid(user_ns, urange[1]); if (!gid_valid(low) || !gid_valid(high)) return -EINVAL; if (urange[1] < urange[0] || gid_lt(high, low)) { low = make_kgid(&init_user_ns, 1); high = make_kgid(&init_user_ns, 0); } set_ping_group_range(table, low, high); } return ret; } static int ipv4_fwd_update_priority(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; int ret; net = container_of(table->data, struct net, ipv4.sysctl_ip_fwd_update_priority); ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) call_netevent_notifiers(NETEVENT_IPV4_FWD_UPDATE_PRIORITY_UPDATE, net); return ret; } static int proc_tcp_congestion_control(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(ctl->data, struct net, ipv4.tcp_congestion_control); char val[TCP_CA_NAME_MAX]; struct ctl_table tbl = { .data = val, .maxlen = TCP_CA_NAME_MAX, }; int ret; tcp_get_default_congestion_control(net, val); ret = proc_dostring(&tbl, write, buffer, lenp, ppos); if (write && ret == 0) ret = tcp_set_default_congestion_control(net, val); return ret; } static int proc_tcp_available_congestion_control(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX, }; int ret; tbl.data = kmalloc(tbl.maxlen, GFP_USER); if (!tbl.data) return -ENOMEM; tcp_get_available_congestion_control(tbl.data, TCP_CA_BUF_MAX); ret = proc_dostring(&tbl, write, buffer, lenp, ppos); kfree(tbl.data); return ret; } static int proc_allowed_congestion_control(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX }; int ret; tbl.data = kmalloc(tbl.maxlen, GFP_USER); if (!tbl.data) return -ENOMEM; tcp_get_allowed_congestion_control(tbl.data, tbl.maxlen); ret = proc_dostring(&tbl, write, buffer, lenp, ppos); if (write && ret == 0) ret = tcp_set_allowed_congestion_control(tbl.data); kfree(tbl.data); return ret; } static int sscanf_key(char *buf, __le32 *key) { u32 user_key[4]; int i, ret = 0; if (sscanf(buf, "%x-%x-%x-%x", user_key, user_key + 1, user_key + 2, user_key + 3) != 4) { ret = -EINVAL; } else { for (i = 0; i < ARRAY_SIZE(user_key); i++) key[i] = cpu_to_le32(user_key[i]); } pr_debug("proc TFO key set 0x%x-%x-%x-%x <- 0x%s: %u\n", user_key[0], user_key[1], user_key[2], user_key[3], buf, ret); return ret; } static int proc_tcp_fastopen_key(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, ipv4.sysctl_tcp_fastopen); /* maxlen to print the list of keys in hex (*2), with dashes * separating doublewords and a comma in between keys. */ struct ctl_table tbl = { .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2 * TCP_FASTOPEN_KEY_MAX) + (TCP_FASTOPEN_KEY_MAX * 5)) }; u32 user_key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u32)]; __le32 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(__le32)]; char *backup_data; int ret, i = 0, off = 0, n_keys; tbl.data = kmalloc(tbl.maxlen, GFP_KERNEL); if (!tbl.data) return -ENOMEM; n_keys = tcp_fastopen_get_cipher(net, NULL, (u64 *)key); if (!n_keys) { memset(&key[0], 0, TCP_FASTOPEN_KEY_LENGTH); n_keys = 1; } for (i = 0; i < n_keys * 4; i++) user_key[i] = le32_to_cpu(key[i]); for (i = 0; i < n_keys; i++) { off += snprintf(tbl.data + off, tbl.maxlen - off, "%08x-%08x-%08x-%08x", user_key[i * 4], user_key[i * 4 + 1], user_key[i * 4 + 2], user_key[i * 4 + 3]); if (WARN_ON_ONCE(off >= tbl.maxlen - 1)) break; if (i + 1 < n_keys) off += snprintf(tbl.data + off, tbl.maxlen - off, ","); } ret = proc_dostring(&tbl, write, buffer, lenp, ppos); if (write && ret == 0) { backup_data = strchr(tbl.data, ','); if (backup_data) { *backup_data = '\0'; backup_data++; } if (sscanf_key(tbl.data, key)) { ret = -EINVAL; goto bad_key; } if (backup_data) { if (sscanf_key(backup_data, key + 4)) { ret = -EINVAL; goto bad_key; } } tcp_fastopen_reset_cipher(net, NULL, key, backup_data ? key + 4 : NULL); } bad_key: kfree(tbl.data); return ret; } static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, ipv4.sysctl_tcp_fastopen_blackhole_timeout); int ret; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) atomic_set(&net->ipv4.tfo_active_disable_times, 0); return ret; } static int proc_tcp_available_ulp(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table tbl = { .maxlen = TCP_ULP_BUF_MAX, }; int ret; tbl.data = kmalloc(tbl.maxlen, GFP_USER); if (!tbl.data) return -ENOMEM; tcp_get_available_ulp(tbl.data, TCP_ULP_BUF_MAX); ret = proc_dostring(&tbl, write, buffer, lenp, ppos); kfree(tbl.data); return ret; } static int proc_tcp_ehash_entries(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, ipv4.sysctl_tcp_child_ehash_entries); struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo; int tcp_ehash_entries; struct ctl_table tbl; tcp_ehash_entries = hinfo->ehash_mask + 1; /* A negative number indicates that the child netns * shares the global ehash. */ if (!net_eq(net, &init_net) && !hinfo->pernet) tcp_ehash_entries *= -1; memset(&tbl, 0, sizeof(tbl)); tbl.data = &tcp_ehash_entries; tbl.maxlen = sizeof(int); return proc_dointvec(&tbl, write, buffer, lenp, ppos); } static int proc_udp_hash_entries(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, ipv4.sysctl_udp_child_hash_entries); int udp_hash_entries; struct ctl_table tbl; udp_hash_entries = net->ipv4.udp_table->mask + 1; /* A negative number indicates that the child netns * shares the global udp_table. */ if (!net_eq(net, &init_net) && net->ipv4.udp_table == &udp_table) udp_hash_entries *= -1; memset(&tbl, 0, sizeof(tbl)); tbl.data = &udp_hash_entries; tbl.maxlen = sizeof(int); return proc_dointvec(&tbl, write, buffer, lenp, ppos); } #ifdef CONFIG_IP_ROUTE_MULTIPATH static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, ipv4.sysctl_fib_multipath_hash_policy); int ret; ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) call_netevent_notifiers(NETEVENT_IPV4_MPATH_HASH_UPDATE, net); return ret; } static int proc_fib_multipath_hash_fields(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; int ret; net = container_of(table->data, struct net, ipv4.sysctl_fib_multipath_hash_fields); ret = proc_douintvec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) call_netevent_notifiers(NETEVENT_IPV4_MPATH_HASH_UPDATE, net); return ret; } #endif static struct ctl_table ipv4_table[] = { { .procname = "tcp_max_orphans", .data = &sysctl_tcp_max_orphans, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "inet_peer_threshold", .data = &inet_peer_threshold, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "inet_peer_minttl", .data = &inet_peer_minttl, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "inet_peer_maxttl", .data = &inet_peer_maxttl, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "tcp_mem", .maxlen = sizeof(sysctl_tcp_mem), .data = &sysctl_tcp_mem, .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "tcp_low_latency", .data = &sysctl_tcp_low_latency, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, #ifdef CONFIG_NETLABEL { .procname = "cipso_cache_enable", .data = &cipso_v4_cache_enabled, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "cipso_cache_bucket_size", .data = &cipso_v4_cache_bucketsize, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "cipso_rbm_optfmt", .data = &cipso_v4_rbm_optfmt, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "cipso_rbm_strictvalid", .data = &cipso_v4_rbm_strictvalid, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, #endif /* CONFIG_NETLABEL */ { .procname = "tcp_available_ulp", .maxlen = TCP_ULP_BUF_MAX, .mode = 0444, .proc_handler = proc_tcp_available_ulp, }, { .procname = "icmp_msgs_per_sec", .data = &sysctl_icmp_msgs_per_sec, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, { .procname = "icmp_msgs_burst", .data = &sysctl_icmp_msgs_burst, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, { .procname = "udp_mem", .data = &sysctl_udp_mem, .maxlen = sizeof(sysctl_udp_mem), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "fib_sync_mem", .data = &sysctl_fib_sync_mem, .maxlen = sizeof(sysctl_fib_sync_mem), .mode = 0644, .proc_handler = proc_douintvec_minmax, .extra1 = &sysctl_fib_sync_mem_min, .extra2 = &sysctl_fib_sync_mem_max, }, }; static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_max_tw_buckets", .data = &init_net.ipv4.tcp_death_row.sysctl_max_tw_buckets, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "icmp_echo_ignore_all", .data = &init_net.ipv4.sysctl_icmp_echo_ignore_all, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE }, { .procname = "icmp_echo_enable_probe", .data = &init_net.ipv4.sysctl_icmp_echo_enable_probe, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE }, { .procname = "icmp_echo_ignore_broadcasts", .data = &init_net.ipv4.sysctl_icmp_echo_ignore_broadcasts, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE }, { .procname = "icmp_ignore_bogus_error_responses", .data = &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE }, { .procname = "icmp_errors_use_inbound_ifaddr", .data = &init_net.ipv4.sysctl_icmp_errors_use_inbound_ifaddr, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE }, { .procname = "icmp_ratelimit", .data = &init_net.ipv4.sysctl_icmp_ratelimit, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, }, { .procname = "icmp_ratemask", .data = &init_net.ipv4.sysctl_icmp_ratemask, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "ping_group_range", .data = &init_net.ipv4.ping_group_range.range, .maxlen = sizeof(gid_t)*2, .mode = 0644, .proc_handler = ipv4_ping_group_range, }, #ifdef CONFIG_NET_L3_MASTER_DEV { .procname = "raw_l3mdev_accept", .data = &init_net.ipv4.sysctl_raw_l3mdev_accept, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #endif { .procname = "tcp_ecn", .data = &init_net.ipv4.sysctl_tcp_ecn, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, { .procname = "tcp_ecn_fallback", .data = &init_net.ipv4.sysctl_tcp_ecn_fallback, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "ip_dynaddr", .data = &init_net.ipv4.sysctl_ip_dynaddr, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "ip_early_demux", .data = &init_net.ipv4.sysctl_ip_early_demux, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "udp_early_demux", .data = &init_net.ipv4.sysctl_udp_early_demux, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_early_demux", .data = &init_net.ipv4.sysctl_tcp_early_demux, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "nexthop_compat_mode", .data = &init_net.ipv4.sysctl_nexthop_compat_mode, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "ip_default_ttl", .data = &init_net.ipv4.sysctl_ip_default_ttl, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = &ip_ttl_min, .extra2 = &ip_ttl_max, }, { .procname = "ip_local_port_range", .maxlen = 0, .data = &init_net, .mode = 0644, .proc_handler = ipv4_local_port_range, }, { .procname = "ip_local_reserved_ports", .data = &init_net.ipv4.sysctl_local_reserved_ports, .maxlen = 65536, .mode = 0644, .proc_handler = proc_do_large_bitmap, }, { .procname = "ip_no_pmtu_disc", .data = &init_net.ipv4.sysctl_ip_no_pmtu_disc, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "ip_forward_use_pmtu", .data = &init_net.ipv4.sysctl_ip_fwd_use_pmtu, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "ip_forward_update_priority", .data = &init_net.ipv4.sysctl_ip_fwd_update_priority, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = ipv4_fwd_update_priority, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "ip_nonlocal_bind", .data = &init_net.ipv4.sysctl_ip_nonlocal_bind, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "ip_autobind_reuse", .data = &init_net.ipv4.sysctl_ip_autobind_reuse, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "fwmark_reflect", .data = &init_net.ipv4.sysctl_fwmark_reflect, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_fwmark_accept", .data = &init_net.ipv4.sysctl_tcp_fwmark_accept, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, #ifdef CONFIG_NET_L3_MASTER_DEV { .procname = "tcp_l3mdev_accept", .data = &init_net.ipv4.sysctl_tcp_l3mdev_accept, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #endif { .procname = "tcp_mtu_probing", .data = &init_net.ipv4.sysctl_tcp_mtu_probing, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_base_mss", .data = &init_net.ipv4.sysctl_tcp_base_mss, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "tcp_min_snd_mss", .data = &init_net.ipv4.sysctl_tcp_min_snd_mss, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &tcp_min_snd_mss_min, .extra2 = &tcp_min_snd_mss_max, }, { .procname = "tcp_mtu_probe_floor", .data = &init_net.ipv4.sysctl_tcp_mtu_probe_floor, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &tcp_min_snd_mss_min, .extra2 = &tcp_min_snd_mss_max, }, { .procname = "tcp_probe_threshold", .data = &init_net.ipv4.sysctl_tcp_probe_threshold, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "tcp_probe_interval", .data = &init_net.ipv4.sysctl_tcp_probe_interval, .maxlen = sizeof(u32), .mode = 0644, .proc_handler = proc_douintvec_minmax, .extra2 = &u32_max_div_HZ, }, { .procname = "igmp_link_local_mcast_reports", .data = &init_net.ipv4.sysctl_igmp_llm_reports, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "igmp_max_memberships", .data = &init_net.ipv4.sysctl_igmp_max_memberships, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "igmp_max_msf", .data = &init_net.ipv4.sysctl_igmp_max_msf, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, #ifdef CONFIG_IP_MULTICAST { .procname = "igmp_qrv", .data = &init_net.ipv4.sysctl_igmp_qrv, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE }, #endif { .procname = "tcp_congestion_control", .data = &init_net.ipv4.tcp_congestion_control, .mode = 0644, .maxlen = TCP_CA_NAME_MAX, .proc_handler = proc_tcp_congestion_control, }, { .procname = "tcp_available_congestion_control", .maxlen = TCP_CA_BUF_MAX, .mode = 0444, .proc_handler = proc_tcp_available_congestion_control, }, { .procname = "tcp_allowed_congestion_control", .maxlen = TCP_CA_BUF_MAX, .mode = 0644, .proc_handler = proc_allowed_congestion_control, }, { .procname = "tcp_keepalive_time", .data = &init_net.ipv4.sysctl_tcp_keepalive_time, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "tcp_keepalive_probes", .data = &init_net.ipv4.sysctl_tcp_keepalive_probes, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_keepalive_intvl", .data = &init_net.ipv4.sysctl_tcp_keepalive_intvl, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "tcp_syn_retries", .data = &init_net.ipv4.sysctl_tcp_syn_retries, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = &tcp_syn_retries_min, .extra2 = &tcp_syn_retries_max }, { .procname = "tcp_synack_retries", .data = &init_net.ipv4.sysctl_tcp_synack_retries, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, #ifdef CONFIG_SYN_COOKIES { .procname = "tcp_syncookies", .data = &init_net.ipv4.sysctl_tcp_syncookies, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, #endif { .procname = "tcp_migrate_req", .data = &init_net.ipv4.sysctl_tcp_migrate_req, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE }, { .procname = "tcp_reordering", .data = &init_net.ipv4.sysctl_tcp_reordering, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "tcp_retries1", .data = &init_net.ipv4.sysctl_tcp_retries1, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra2 = &tcp_retr1_max }, { .procname = "tcp_retries2", .data = &init_net.ipv4.sysctl_tcp_retries2, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_orphan_retries", .data = &init_net.ipv4.sysctl_tcp_orphan_retries, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_fin_timeout", .data = &init_net.ipv4.sysctl_tcp_fin_timeout, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "tcp_notsent_lowat", .data = &init_net.ipv4.sysctl_tcp_notsent_lowat, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_douintvec, }, { .procname = "tcp_tw_reuse", .data = &init_net.ipv4.sysctl_tcp_tw_reuse, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, { .procname = "tcp_max_syn_backlog", .data = &init_net.ipv4.sysctl_max_syn_backlog, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "tcp_fastopen", .data = &init_net.ipv4.sysctl_tcp_fastopen, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "tcp_fastopen_key", .mode = 0600, .data = &init_net.ipv4.sysctl_tcp_fastopen, /* maxlen to print the list of keys in hex (*2), with dashes * separating doublewords and a comma in between keys. */ .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2 * TCP_FASTOPEN_KEY_MAX) + (TCP_FASTOPEN_KEY_MAX * 5)), .proc_handler = proc_tcp_fastopen_key, }, { .procname = "tcp_fastopen_blackhole_timeout_sec", .data = &init_net.ipv4.sysctl_tcp_fastopen_blackhole_timeout, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_tfo_blackhole_detect_timeout, .extra1 = SYSCTL_ZERO, }, #ifdef CONFIG_IP_ROUTE_MULTIPATH { .procname = "fib_multipath_use_neigh", .data = &init_net.ipv4.sysctl_fib_multipath_use_neigh, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "fib_multipath_hash_policy", .data = &init_net.ipv4.sysctl_fib_multipath_hash_policy, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_fib_multipath_hash_policy, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_THREE, }, { .procname = "fib_multipath_hash_fields", .data = &init_net.ipv4.sysctl_fib_multipath_hash_fields, .maxlen = sizeof(u32), .mode = 0644, .proc_handler = proc_fib_multipath_hash_fields, .extra1 = SYSCTL_ONE, .extra2 = &fib_multipath_hash_fields_all_mask, }, #endif { .procname = "ip_unprivileged_port_start", .maxlen = sizeof(int), .data = &init_net.ipv4.sysctl_ip_prot_sock, .mode = 0644, .proc_handler = ipv4_privileged_ports, }, #ifdef CONFIG_NET_L3_MASTER_DEV { .procname = "udp_l3mdev_accept", .data = &init_net.ipv4.sysctl_udp_l3mdev_accept, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #endif { .procname = "tcp_sack", .data = &init_net.ipv4.sysctl_tcp_sack, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_window_scaling", .data = &init_net.ipv4.sysctl_tcp_window_scaling, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_timestamps", .data = &init_net.ipv4.sysctl_tcp_timestamps, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_early_retrans", .data = &init_net.ipv4.sysctl_tcp_early_retrans, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_FOUR, }, { .procname = "tcp_recovery", .data = &init_net.ipv4.sysctl_tcp_recovery, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_thin_linear_timeouts", .data = &init_net.ipv4.sysctl_tcp_thin_linear_timeouts, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_slow_start_after_idle", .data = &init_net.ipv4.sysctl_tcp_slow_start_after_idle, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_retrans_collapse", .data = &init_net.ipv4.sysctl_tcp_retrans_collapse, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_stdurg", .data = &init_net.ipv4.sysctl_tcp_stdurg, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_rfc1337", .data = &init_net.ipv4.sysctl_tcp_rfc1337, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_abort_on_overflow", .data = &init_net.ipv4.sysctl_tcp_abort_on_overflow, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_fack", .data = &init_net.ipv4.sysctl_tcp_fack, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_max_reordering", .data = &init_net.ipv4.sysctl_tcp_max_reordering, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "tcp_dsack", .data = &init_net.ipv4.sysctl_tcp_dsack, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_app_win", .data = &init_net.ipv4.sysctl_tcp_app_win, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &tcp_app_win_max, }, { .procname = "tcp_adv_win_scale", .data = &init_net.ipv4.sysctl_tcp_adv_win_scale, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &tcp_adv_win_scale_min, .extra2 = &tcp_adv_win_scale_max, }, { .procname = "tcp_frto", .data = &init_net.ipv4.sysctl_tcp_frto, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_no_metrics_save", .data = &init_net.ipv4.sysctl_tcp_nometrics_save, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_no_ssthresh_metrics_save", .data = &init_net.ipv4.sysctl_tcp_no_ssthresh_metrics_save, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "tcp_moderate_rcvbuf", .data = &init_net.ipv4.sysctl_tcp_moderate_rcvbuf, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_tso_win_divisor", .data = &init_net.ipv4.sysctl_tcp_tso_win_divisor, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_workaround_signed_windows", .data = &init_net.ipv4.sysctl_tcp_workaround_signed_windows, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_limit_output_bytes", .data = &init_net.ipv4.sysctl_tcp_limit_output_bytes, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "tcp_challenge_ack_limit", .data = &init_net.ipv4.sysctl_tcp_challenge_ack_limit, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "tcp_min_tso_segs", .data = &init_net.ipv4.sysctl_tcp_min_tso_segs, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ONE, }, { .procname = "tcp_tso_rtt_log", .data = &init_net.ipv4.sysctl_tcp_tso_rtt_log, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_min_rtt_wlen", .data = &init_net.ipv4.sysctl_tcp_min_rtt_wlen, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &one_day_secs }, { .procname = "tcp_autocorking", .data = &init_net.ipv4.sysctl_tcp_autocorking, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "tcp_invalid_ratelimit", .data = &init_net.ipv4.sysctl_tcp_invalid_ratelimit, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, }, { .procname = "tcp_pacing_ss_ratio", .data = &init_net.ipv4.sysctl_tcp_pacing_ss_ratio, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE_THOUSAND, }, { .procname = "tcp_pacing_ca_ratio", .data = &init_net.ipv4.sysctl_tcp_pacing_ca_ratio, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE_THOUSAND, }, { .procname = "tcp_wmem", .data = &init_net.ipv4.sysctl_tcp_wmem, .maxlen = sizeof(init_net.ipv4.sysctl_tcp_wmem), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, }, { .procname = "tcp_rmem", .data = &init_net.ipv4.sysctl_tcp_rmem, .maxlen = sizeof(init_net.ipv4.sysctl_tcp_rmem), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, }, { .procname = "tcp_comp_sack_delay_ns", .data = &init_net.ipv4.sysctl_tcp_comp_sack_delay_ns, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "tcp_comp_sack_slack_ns", .data = &init_net.ipv4.sysctl_tcp_comp_sack_slack_ns, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "tcp_comp_sack_nr", .data = &init_net.ipv4.sysctl_tcp_comp_sack_nr, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, }, { .procname = "tcp_backlog_ack_defer", .data = &init_net.ipv4.sysctl_tcp_backlog_ack_defer, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "tcp_reflect_tos", .data = &init_net.ipv4.sysctl_tcp_reflect_tos, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "tcp_ehash_entries", .data = &init_net.ipv4.sysctl_tcp_child_ehash_entries, .mode = 0444, .proc_handler = proc_tcp_ehash_entries, }, { .procname = "tcp_child_ehash_entries", .data = &init_net.ipv4.sysctl_tcp_child_ehash_entries, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_douintvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &tcp_child_ehash_entries_max, }, { .procname = "udp_hash_entries", .data = &init_net.ipv4.sysctl_udp_child_hash_entries, .mode = 0444, .proc_handler = proc_udp_hash_entries, }, { .procname = "udp_child_hash_entries", .data = &init_net.ipv4.sysctl_udp_child_hash_entries, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_douintvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &udp_child_hash_entries_max, }, { .procname = "udp_rmem_min", .data = &init_net.ipv4.sysctl_udp_rmem_min, .maxlen = sizeof(init_net.ipv4.sysctl_udp_rmem_min), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE }, { .procname = "udp_wmem_min", .data = &init_net.ipv4.sysctl_udp_wmem_min, .maxlen = sizeof(init_net.ipv4.sysctl_udp_wmem_min), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE }, { .procname = "fib_notify_on_flag_change", .data = &init_net.ipv4.sysctl_fib_notify_on_flag_change, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, { .procname = "tcp_plb_enabled", .data = &init_net.ipv4.sysctl_tcp_plb_enabled, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "tcp_plb_idle_rehash_rounds", .data = &init_net.ipv4.sysctl_tcp_plb_idle_rehash_rounds, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra2 = &tcp_plb_max_rounds, }, { .procname = "tcp_plb_rehash_rounds", .data = &init_net.ipv4.sysctl_tcp_plb_rehash_rounds, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra2 = &tcp_plb_max_rounds, }, { .procname = "tcp_plb_suspend_rto_sec", .data = &init_net.ipv4.sysctl_tcp_plb_suspend_rto_sec, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_plb_cong_thresh", .data = &init_net.ipv4.sysctl_tcp_plb_cong_thresh, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &tcp_plb_max_cong_thresh, }, { .procname = "tcp_syn_linear_timeouts", .data = &init_net.ipv4.sysctl_tcp_syn_linear_timeouts, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &tcp_syn_linear_timeouts_max, }, { .procname = "tcp_shrink_window", .data = &init_net.ipv4.sysctl_tcp_shrink_window, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "tcp_pingpong_thresh", .data = &init_net.ipv4.sysctl_tcp_pingpong_thresh, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ONE, }, }; static __net_init int ipv4_sysctl_init_net(struct net *net) { size_t table_size = ARRAY_SIZE(ipv4_net_table); struct ctl_table *table; table = ipv4_net_table; if (!net_eq(net, &init_net)) { int i; table = kmemdup(table, sizeof(ipv4_net_table), GFP_KERNEL); if (!table) goto err_alloc; for (i = 0; i < table_size; i++) { if (table[i].data) { /* Update the variables to point into * the current struct net */ table[i].data += (void *)net - (void *)&init_net; } else { /* Entries without data pointer are global; * Make them read-only in non-init_net ns */ table[i].mode &= ~0222; } } } net->ipv4.ipv4_hdr = register_net_sysctl_sz(net, "net/ipv4", table, table_size); if (!net->ipv4.ipv4_hdr) goto err_reg; net->ipv4.sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL); if (!net->ipv4.sysctl_local_reserved_ports) goto err_ports; return 0; err_ports: unregister_net_sysctl_table(net->ipv4.ipv4_hdr); err_reg: if (!net_eq(net, &init_net)) kfree(table); err_alloc: return -ENOMEM; } static __net_exit void ipv4_sysctl_exit_net(struct net *net) { const struct ctl_table *table; kfree(net->ipv4.sysctl_local_reserved_ports); table = net->ipv4.ipv4_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.ipv4_hdr); kfree(table); } static __net_initdata struct pernet_operations ipv4_sysctl_ops = { .init = ipv4_sysctl_init_net, .exit = ipv4_sysctl_exit_net, }; static __init int sysctl_ipv4_init(void) { struct ctl_table_header *hdr; hdr = register_net_sysctl(&init_net, "net/ipv4", ipv4_table); if (!hdr) return -ENOMEM; if (register_pernet_subsys(&ipv4_sysctl_ops)) { unregister_net_sysctl_table(hdr); return -ENOMEM; } return 0; } __initcall(sysctl_ipv4_init);
1 12 13 9 5 9 2 35 32 30 2 31 31 31 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 /* SPDX-License-Identifier: GPL-2.0-only */ /* * sha256_base.h - core logic for SHA-256 implementations * * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org> */ #ifndef _CRYPTO_SHA256_BASE_H #define _CRYPTO_SHA256_BASE_H #include <asm/byteorder.h> #include <asm/unaligned.h> #include <crypto/internal/hash.h> #include <crypto/sha2.h> #include <linux/string.h> #include <linux/types.h> typedef void (sha256_block_fn)(struct sha256_state *sst, u8 const *src, int blocks); static inline int sha224_base_init(struct shash_desc *desc) { struct sha256_state *sctx = shash_desc_ctx(desc); sha224_init(sctx); return 0; } static inline int sha256_base_init(struct shash_desc *desc) { struct sha256_state *sctx = shash_desc_ctx(desc); sha256_init(sctx); return 0; } static inline int lib_sha256_base_do_update(struct sha256_state *sctx, const u8 *data, unsigned int len, sha256_block_fn *block_fn) { unsigned int partial = sctx->count % SHA256_BLOCK_SIZE; sctx->count += len; if (unlikely((partial + len) >= SHA256_BLOCK_SIZE)) { int blocks; if (partial) { int p = SHA256_BLOCK_SIZE - partial; memcpy(sctx->buf + partial, data, p); data += p; len -= p; block_fn(sctx, sctx->buf, 1); } blocks = len / SHA256_BLOCK_SIZE; len %= SHA256_BLOCK_SIZE; if (blocks) { block_fn(sctx, data, blocks); data += blocks * SHA256_BLOCK_SIZE; } partial = 0; } if (len) memcpy(sctx->buf + partial, data, len); return 0; } static inline int sha256_base_do_update(struct shash_desc *desc, const u8 *data, unsigned int len, sha256_block_fn *block_fn) { struct sha256_state *sctx = shash_desc_ctx(desc); return lib_sha256_base_do_update(sctx, data, len, block_fn); } static inline int lib_sha256_base_do_finalize(struct sha256_state *sctx, sha256_block_fn *block_fn) { const int bit_offset = SHA256_BLOCK_SIZE - sizeof(__be64); __be64 *bits = (__be64 *)(sctx->buf + bit_offset); unsigned int partial = sctx->count % SHA256_BLOCK_SIZE; sctx->buf[partial++] = 0x80; if (partial > bit_offset) { memset(sctx->buf + partial, 0x0, SHA256_BLOCK_SIZE - partial); partial = 0; block_fn(sctx, sctx->buf, 1); } memset(sctx->buf + partial, 0x0, bit_offset - partial); *bits = cpu_to_be64(sctx->count << 3); block_fn(sctx, sctx->buf, 1); return 0; } static inline int sha256_base_do_finalize(struct shash_desc *desc, sha256_block_fn *block_fn) { struct sha256_state *sctx = shash_desc_ctx(desc); return lib_sha256_base_do_finalize(sctx, block_fn); } static inline int lib_sha256_base_finish(struct sha256_state *sctx, u8 *out, unsigned int digest_size) { __be32 *digest = (__be32 *)out; int i; for (i = 0; digest_size > 0; i++, digest_size -= sizeof(__be32)) put_unaligned_be32(sctx->state[i], digest++); memzero_explicit(sctx, sizeof(*sctx)); return 0; } static inline int sha256_base_finish(struct shash_desc *desc, u8 *out) { unsigned int digest_size = crypto_shash_digestsize(desc->tfm); struct sha256_state *sctx = shash_desc_ctx(desc); return lib_sha256_base_finish(sctx, out, digest_size); } #endif /* _CRYPTO_SHA256_BASE_H */
2 2 2 2 1 2 3 3 4 3 3 2 1 6 6 6 4 4 4 4 4 4 3 1 2 4 4 4 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/syscalls.h> #include <linux/fdtable.h> #include <linux/string.h> #include <linux/random.h> #include <linux/module.h> #include <linux/ptrace.h> #include <linux/init.h> #include <linux/errno.h> #include <linux/cache.h> #include <linux/bug.h> #include <linux/err.h> #include <linux/kcmp.h> #include <linux/capability.h> #include <linux/list.h> #include <linux/eventpoll.h> #include <linux/file.h> #include <asm/unistd.h> /* * We don't expose the real in-memory order of objects for security reasons. * But still the comparison results should be suitable for sorting. So we * obfuscate kernel pointers values and compare the production instead. * * The obfuscation is done in two steps. First we xor the kernel pointer with * a random value, which puts pointer into a new position in a reordered space. * Secondly we multiply the xor production with a large odd random number to * permute its bits even more (the odd multiplier guarantees that the product * is unique ever after the high bits are truncated, since any odd number is * relative prime to 2^n). * * Note also that the obfuscation itself is invisible to userspace and if needed * it can be changed to an alternate scheme. */ static unsigned long cookies[KCMP_TYPES][2] __read_mostly; static long kptr_obfuscate(long v, int type) { return (v ^ cookies[type][0]) * cookies[type][1]; } /* * 0 - equal, i.e. v1 = v2 * 1 - less than, i.e. v1 < v2 * 2 - greater than, i.e. v1 > v2 * 3 - not equal but ordering unavailable (reserved for future) */ static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type) { long t1, t2; t1 = kptr_obfuscate((long)v1, type); t2 = kptr_obfuscate((long)v2, type); return (t1 < t2) | ((t1 > t2) << 1); } /* The caller must have pinned the task */ static struct file * get_file_raw_ptr(struct task_struct *task, unsigned int idx) { struct file *file; rcu_read_lock(); file = task_lookup_fdget_rcu(task, idx); rcu_read_unlock(); if (file) fput(file); return file; } static void kcmp_unlock(struct rw_semaphore *l1, struct rw_semaphore *l2) { if (likely(l2 != l1)) up_read(l2); up_read(l1); } static int kcmp_lock(struct rw_semaphore *l1, struct rw_semaphore *l2) { int err; if (l2 > l1) swap(l1, l2); err = down_read_killable(l1); if (!err && likely(l1 != l2)) { err = down_read_killable_nested(l2, SINGLE_DEPTH_NESTING); if (err) up_read(l1); } return err; } #ifdef CONFIG_EPOLL static int kcmp_epoll_target(struct task_struct *task1, struct task_struct *task2, unsigned long idx1, struct kcmp_epoll_slot __user *uslot) { struct file *filp, *filp_epoll, *filp_tgt; struct kcmp_epoll_slot slot; if (copy_from_user(&slot, uslot, sizeof(slot))) return -EFAULT; filp = get_file_raw_ptr(task1, idx1); if (!filp) return -EBADF; filp_epoll = fget_task(task2, slot.efd); if (!filp_epoll) return -EBADF; filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff); fput(filp_epoll); if (IS_ERR(filp_tgt)) return PTR_ERR(filp_tgt); return kcmp_ptr(filp, filp_tgt, KCMP_FILE); } #else static int kcmp_epoll_target(struct task_struct *task1, struct task_struct *task2, unsigned long idx1, struct kcmp_epoll_slot __user *uslot) { return -EOPNOTSUPP; } #endif SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, unsigned long, idx1, unsigned long, idx2) { struct task_struct *task1, *task2; int ret; rcu_read_lock(); /* * Tasks are looked up in caller's PID namespace only. */ task1 = find_task_by_vpid(pid1); task2 = find_task_by_vpid(pid2); if (!task1 || !task2) goto err_no_task; get_task_struct(task1); get_task_struct(task2); rcu_read_unlock(); /* * One should have enough rights to inspect task details. */ ret = kcmp_lock(&task1->signal->exec_update_lock, &task2->signal->exec_update_lock); if (ret) goto err; if (!ptrace_may_access(task1, PTRACE_MODE_READ_REALCREDS) || !ptrace_may_access(task2, PTRACE_MODE_READ_REALCREDS)) { ret = -EPERM; goto err_unlock; } switch (type) { case KCMP_FILE: { struct file *filp1, *filp2; filp1 = get_file_raw_ptr(task1, idx1); filp2 = get_file_raw_ptr(task2, idx2); if (filp1 && filp2) ret = kcmp_ptr(filp1, filp2, KCMP_FILE); else ret = -EBADF; break; } case KCMP_VM: ret = kcmp_ptr(task1->mm, task2->mm, KCMP_VM); break; case KCMP_FILES: ret = kcmp_ptr(task1->files, task2->files, KCMP_FILES); break; case KCMP_FS: ret = kcmp_ptr(task1->fs, task2->fs, KCMP_FS); break; case KCMP_SIGHAND: ret = kcmp_ptr(task1->sighand, task2->sighand, KCMP_SIGHAND); break; case KCMP_IO: ret = kcmp_ptr(task1->io_context, task2->io_context, KCMP_IO); break; case KCMP_SYSVSEM: #ifdef CONFIG_SYSVIPC ret = kcmp_ptr(task1->sysvsem.undo_list, task2->sysvsem.undo_list, KCMP_SYSVSEM); #else ret = -EOPNOTSUPP; #endif break; case KCMP_EPOLL_TFD: ret = kcmp_epoll_target(task1, task2, idx1, (void *)idx2); break; default: ret = -EINVAL; break; } err_unlock: kcmp_unlock(&task1->signal->exec_update_lock, &task2->signal->exec_update_lock); err: put_task_struct(task1); put_task_struct(task2); return ret; err_no_task: rcu_read_unlock(); return -ESRCH; } static __init int kcmp_cookies_init(void) { int i; get_random_bytes(cookies, sizeof(cookies)); for (i = 0; i < KCMP_TYPES; i++) cookies[i][1] |= (~(~0UL >> 1) | 1); return 0; } arch_initcall(kcmp_cookies_init);
9 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2011 IBM Corporation * * Author: * Mimi Zohar <zohar@us.ibm.com> */ #include <linux/xattr.h> #include <linux/evm.h> int posix_xattr_acl(const char *xattr) { int xattr_len = strlen(xattr); if ((strlen(XATTR_NAME_POSIX_ACL_ACCESS) == xattr_len) && (strncmp(XATTR_NAME_POSIX_ACL_ACCESS, xattr, xattr_len) == 0)) return 1; if ((strlen(XATTR_NAME_POSIX_ACL_DEFAULT) == xattr_len) && (strncmp(XATTR_NAME_POSIX_ACL_DEFAULT, xattr, xattr_len) == 0)) return 1; return 0; }
2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Definitions for key type implementations * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #ifndef _LINUX_KEY_TYPE_H #define _LINUX_KEY_TYPE_H #include <linux/key.h> #include <linux/errno.h> #ifdef CONFIG_KEYS struct kernel_pkey_query; struct kernel_pkey_params; /* * Pre-parsed payload, used by key add, update and instantiate. * * This struct will be cleared and data and datalen will be set with the data * and length parameters from the caller and quotalen will be set from * def_datalen from the key type. Then if the preparse() op is provided by the * key type, that will be called. Then the struct will be passed to the * instantiate() or the update() op. * * If the preparse() op is given, the free_preparse() op will be called to * clear the contents. */ struct key_preparsed_payload { const char *orig_description; /* Actual or proposed description (maybe NULL) */ char *description; /* Proposed key description (or NULL) */ union key_payload payload; /* Proposed payload */ const void *data; /* Raw data */ size_t datalen; /* Raw datalen */ size_t quotalen; /* Quota length for proposed payload */ time64_t expiry; /* Expiry time of key */ } __randomize_layout; typedef int (*request_key_actor_t)(struct key *auth_key, void *aux); /* * Preparsed matching criterion. */ struct key_match_data { /* Comparison function, defaults to exact description match, but can be * overridden by type->match_preparse(). Should return true if a match * is found and false if not. */ bool (*cmp)(const struct key *key, const struct key_match_data *match_data); const void *raw_data; /* Raw match data */ void *preparsed; /* For ->match_preparse() to stash stuff */ unsigned lookup_type; /* Type of lookup for this search. */ #define KEYRING_SEARCH_LOOKUP_DIRECT 0x0000 /* Direct lookup by description. */ #define KEYRING_SEARCH_LOOKUP_ITERATE 0x0001 /* Iterative search. */ }; /* * kernel managed key type definition */ struct key_type { /* name of the type */ const char *name; /* default payload length for quota precalculation (optional) * - this can be used instead of calling key_payload_reserve(), that * function only needs to be called if the real datalen is different */ size_t def_datalen; unsigned int flags; #define KEY_TYPE_NET_DOMAIN 0x00000001 /* Keys of this type have a net namespace domain */ #define KEY_TYPE_INSTANT_REAP 0x00000002 /* Keys of this type don't have a delay after expiring */ /* vet a description */ int (*vet_description)(const char *description); /* Preparse the data blob from userspace that is to be the payload, * generating a proposed description and payload that will be handed to * the instantiate() and update() ops. */ int (*preparse)(struct key_preparsed_payload *prep); /* Free a preparse data structure. */ void (*free_preparse)(struct key_preparsed_payload *prep); /* instantiate a key of this type * - this method should call key_payload_reserve() to determine if the * user's quota will hold the payload */ int (*instantiate)(struct key *key, struct key_preparsed_payload *prep); /* update a key of this type (optional) * - this method should call key_payload_reserve() to recalculate the * quota consumption * - the key must be locked against read when modifying */ int (*update)(struct key *key, struct key_preparsed_payload *prep); /* Preparse the data supplied to ->match() (optional). The * data to be preparsed can be found in match_data->raw_data. * The lookup type can also be set by this function. */ int (*match_preparse)(struct key_match_data *match_data); /* Free preparsed match data (optional). This should be supplied it * ->match_preparse() is supplied. */ void (*match_free)(struct key_match_data *match_data); /* clear some of the data from a key on revokation (optional) * - the key's semaphore will be write-locked by the caller */ void (*revoke)(struct key *key); /* clear the data from a key (optional) */ void (*destroy)(struct key *key); /* describe a key */ void (*describe)(const struct key *key, struct seq_file *p); /* read a key's data (optional) * - permission checks will be done by the caller * - the key's semaphore will be readlocked by the caller * - should return the amount of data that could be read, no matter how * much is copied into the buffer * - shouldn't do the copy if the buffer is NULL */ long (*read)(const struct key *key, char *buffer, size_t buflen); /* handle request_key() for this type instead of invoking * /sbin/request-key (optional) * - key is the key to instantiate * - authkey is the authority to assume when instantiating this key * - op is the operation to be done, usually "create" * - the call must not return until the instantiation process has run * its course */ request_key_actor_t request_key; /* Look up a keyring access restriction (optional) * * - NULL is a valid return value (meaning the requested restriction * is known but will never block addition of a key) * - should return -EINVAL if the restriction is unknown */ struct key_restriction *(*lookup_restriction)(const char *params); /* Asymmetric key accessor functions. */ int (*asym_query)(const struct kernel_pkey_params *params, struct kernel_pkey_query *info); int (*asym_eds_op)(struct kernel_pkey_params *params, const void *in, void *out); int (*asym_verify_signature)(struct kernel_pkey_params *params, const void *in, const void *in2); /* internal fields */ struct list_head link; /* link in types list */ struct lock_class_key lock_class; /* key->sem lock class */ } __randomize_layout; extern struct key_type key_type_keyring; extern int register_key_type(struct key_type *ktype); extern void unregister_key_type(struct key_type *ktype); extern int key_payload_reserve(struct key *key, size_t datalen); extern int key_instantiate_and_link(struct key *key, const void *data, size_t datalen, struct key *keyring, struct key *authkey); extern int key_reject_and_link(struct key *key, unsigned timeout, unsigned error, struct key *keyring, struct key *authkey); extern void complete_request_key(struct key *authkey, int error); static inline int key_negate_and_link(struct key *key, unsigned timeout, struct key *keyring, struct key *authkey) { return key_reject_and_link(key, timeout, ENOKEY, keyring, authkey); } extern int generic_key_instantiate(struct key *key, struct key_preparsed_payload *prep); #endif /* CONFIG_KEYS */ #endif /* _LINUX_KEY_TYPE_H */
4 4 4 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 1 1 3 3 3 3 2 4 5 5 5 5 5 5 5 5 5 2 5 5 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 // SPDX-License-Identifier: GPL-2.0-or-later #include <crypto/hash.h> #include <linux/cpu.h> #include <linux/kref.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/percpu.h> #include <linux/workqueue.h> #include <net/tcp.h> static size_t __scratch_size; static DEFINE_PER_CPU(void __rcu *, sigpool_scratch); struct sigpool_entry { struct crypto_ahash *hash; const char *alg; struct kref kref; uint16_t needs_key:1, reserved:15; }; #define CPOOL_SIZE (PAGE_SIZE / sizeof(struct sigpool_entry)) static struct sigpool_entry cpool[CPOOL_SIZE]; static unsigned int cpool_populated; static DEFINE_MUTEX(cpool_mutex); /* Slow-path */ struct scratches_to_free { struct rcu_head rcu; unsigned int cnt; void *scratches[]; }; static void free_old_scratches(struct rcu_head *head) { struct scratches_to_free *stf; stf = container_of(head, struct scratches_to_free, rcu); while (stf->cnt--) kfree(stf->scratches[stf->cnt]); kfree(stf); } /** * sigpool_reserve_scratch - re-allocates scratch buffer, slow-path * @size: request size for the scratch/temp buffer */ static int sigpool_reserve_scratch(size_t size) { struct scratches_to_free *stf; size_t stf_sz = struct_size(stf, scratches, num_possible_cpus()); int cpu, err = 0; lockdep_assert_held(&cpool_mutex); if (__scratch_size >= size) return 0; stf = kmalloc(stf_sz, GFP_KERNEL); if (!stf) return -ENOMEM; stf->cnt = 0; size = max(size, __scratch_size); cpus_read_lock(); for_each_possible_cpu(cpu) { void *scratch, *old_scratch; scratch = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu)); if (!scratch) { err = -ENOMEM; break; } old_scratch = rcu_replace_pointer(per_cpu(sigpool_scratch, cpu), scratch, lockdep_is_held(&cpool_mutex)); if (!cpu_online(cpu) || !old_scratch) { kfree(old_scratch); continue; } stf->scratches[stf->cnt++] = old_scratch; } cpus_read_unlock(); if (!err) __scratch_size = size; call_rcu(&stf->rcu, free_old_scratches); return err; } static void sigpool_scratch_free(void) { int cpu; for_each_possible_cpu(cpu) kfree(rcu_replace_pointer(per_cpu(sigpool_scratch, cpu), NULL, lockdep_is_held(&cpool_mutex))); __scratch_size = 0; } static int __cpool_try_clone(struct crypto_ahash *hash) { struct crypto_ahash *tmp; tmp = crypto_clone_ahash(hash); if (IS_ERR(tmp)) return PTR_ERR(tmp); crypto_free_ahash(tmp); return 0; } static int __cpool_alloc_ahash(struct sigpool_entry *e, const char *alg) { struct crypto_ahash *cpu0_hash; int ret; e->alg = kstrdup(alg, GFP_KERNEL); if (!e->alg) return -ENOMEM; cpu0_hash = crypto_alloc_ahash(alg, 0, CRYPTO_ALG_ASYNC); if (IS_ERR(cpu0_hash)) { ret = PTR_ERR(cpu0_hash); goto out_free_alg; } e->needs_key = crypto_ahash_get_flags(cpu0_hash) & CRYPTO_TFM_NEED_KEY; ret = __cpool_try_clone(cpu0_hash); if (ret) goto out_free_cpu0_hash; e->hash = cpu0_hash; kref_init(&e->kref); return 0; out_free_cpu0_hash: crypto_free_ahash(cpu0_hash); out_free_alg: kfree(e->alg); e->alg = NULL; return ret; } /** * tcp_sigpool_alloc_ahash - allocates pool for ahash requests * @alg: name of async hash algorithm * @scratch_size: reserve a tcp_sigpool::scratch buffer of this size */ int tcp_sigpool_alloc_ahash(const char *alg, size_t scratch_size) { int i, ret; /* slow-path */ mutex_lock(&cpool_mutex); ret = sigpool_reserve_scratch(scratch_size); if (ret) goto out; for (i = 0; i < cpool_populated; i++) { if (!cpool[i].alg) continue; if (strcmp(cpool[i].alg, alg)) continue; /* pairs with tcp_sigpool_release() */ if (!kref_get_unless_zero(&cpool[i].kref)) kref_init(&cpool[i].kref); ret = i; goto out; } for (i = 0; i < cpool_populated; i++) { if (!cpool[i].alg) break; } if (i >= CPOOL_SIZE) { ret = -ENOSPC; goto out; } ret = __cpool_alloc_ahash(&cpool[i], alg); if (!ret) { ret = i; if (i == cpool_populated) cpool_populated++; } out: mutex_unlock(&cpool_mutex); return ret; } EXPORT_SYMBOL_GPL(tcp_sigpool_alloc_ahash); static void __cpool_free_entry(struct sigpool_entry *e) { crypto_free_ahash(e->hash); kfree(e->alg); memset(e, 0, sizeof(*e)); } static void cpool_cleanup_work_cb(struct work_struct *work) { bool free_scratch = true; unsigned int i; mutex_lock(&cpool_mutex); for (i = 0; i < cpool_populated; i++) { if (kref_read(&cpool[i].kref) > 0) { free_scratch = false; continue; } if (!cpool[i].alg) continue; __cpool_free_entry(&cpool[i]); } if (free_scratch) sigpool_scratch_free(); mutex_unlock(&cpool_mutex); } static DECLARE_WORK(cpool_cleanup_work, cpool_cleanup_work_cb); static void cpool_schedule_cleanup(struct kref *kref) { schedule_work(&cpool_cleanup_work); } /** * tcp_sigpool_release - decreases number of users for a pool. If it was * the last user of the pool, releases any memory that was consumed. * @id: tcp_sigpool that was previously allocated by tcp_sigpool_alloc_ahash() */ void tcp_sigpool_release(unsigned int id) { if (WARN_ON_ONCE(id >= cpool_populated || !cpool[id].alg)) return; /* slow-path */ kref_put(&cpool[id].kref, cpool_schedule_cleanup); } EXPORT_SYMBOL_GPL(tcp_sigpool_release); /** * tcp_sigpool_get - increases number of users (refcounter) for a pool * @id: tcp_sigpool that was previously allocated by tcp_sigpool_alloc_ahash() */ void tcp_sigpool_get(unsigned int id) { if (WARN_ON_ONCE(id >= cpool_populated || !cpool[id].alg)) return; kref_get(&cpool[id].kref); } EXPORT_SYMBOL_GPL(tcp_sigpool_get); int tcp_sigpool_start(unsigned int id, struct tcp_sigpool *c) __cond_acquires(RCU_BH) { struct crypto_ahash *hash; rcu_read_lock_bh(); if (WARN_ON_ONCE(id >= cpool_populated || !cpool[id].alg)) { rcu_read_unlock_bh(); return -EINVAL; } hash = crypto_clone_ahash(cpool[id].hash); if (IS_ERR(hash)) { rcu_read_unlock_bh(); return PTR_ERR(hash); } c->req = ahash_request_alloc(hash, GFP_ATOMIC); if (!c->req) { crypto_free_ahash(hash); rcu_read_unlock_bh(); return -ENOMEM; } ahash_request_set_callback(c->req, 0, NULL, NULL); /* Pairs with tcp_sigpool_reserve_scratch(), scratch area is * valid (allocated) until tcp_sigpool_end(). */ c->scratch = rcu_dereference_bh(*this_cpu_ptr(&sigpool_scratch)); return 0; } EXPORT_SYMBOL_GPL(tcp_sigpool_start); void tcp_sigpool_end(struct tcp_sigpool *c) __releases(RCU_BH) { struct crypto_ahash *hash = crypto_ahash_reqtfm(c->req); rcu_read_unlock_bh(); ahash_request_free(c->req); crypto_free_ahash(hash); } EXPORT_SYMBOL_GPL(tcp_sigpool_end); /** * tcp_sigpool_algo - return algorithm of tcp_sigpool * @id: tcp_sigpool that was previously allocated by tcp_sigpool_alloc_ahash() * @buf: buffer to return name of algorithm * @buf_len: size of @buf */ size_t tcp_sigpool_algo(unsigned int id, char *buf, size_t buf_len) { if (WARN_ON_ONCE(id >= cpool_populated || !cpool[id].alg)) return -EINVAL; return strscpy(buf, cpool[id].alg, buf_len); } EXPORT_SYMBOL_GPL(tcp_sigpool_algo); /** * tcp_sigpool_hash_skb_data - hash data in skb with initialized tcp_sigpool * @hp: tcp_sigpool pointer * @skb: buffer to add sign for * @header_len: TCP header length for this segment */ int tcp_sigpool_hash_skb_data(struct tcp_sigpool *hp, const struct sk_buff *skb, unsigned int header_len) { const unsigned int head_data_len = skb_headlen(skb) > header_len ? skb_headlen(skb) - header_len : 0; const struct skb_shared_info *shi = skb_shinfo(skb); const struct tcphdr *tp = tcp_hdr(skb); struct ahash_request *req = hp->req; struct sk_buff *frag_iter; struct scatterlist sg; unsigned int i; sg_init_table(&sg, 1); sg_set_buf(&sg, ((u8 *)tp) + header_len, head_data_len); ahash_request_set_crypt(req, &sg, NULL, head_data_len); if (crypto_ahash_update(req)) return 1; for (i = 0; i < shi->nr_frags; ++i) { const skb_frag_t *f = &shi->frags[i]; unsigned int offset = skb_frag_off(f); struct page *page; page = skb_frag_page(f) + (offset >> PAGE_SHIFT); sg_set_page(&sg, page, skb_frag_size(f), offset_in_page(offset)); ahash_request_set_crypt(req, &sg, NULL, skb_frag_size(f)); if (crypto_ahash_update(req)) return 1; } skb_walk_frags(skb, frag_iter) if (tcp_sigpool_hash_skb_data(hp, frag_iter, 0)) return 1; return 0; } EXPORT_SYMBOL(tcp_sigpool_hash_skb_data); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Per-CPU pool of crypto requests");
181 181 178 180 150 5 32 180 1 1 1 2669 2701 2666 2670 2670 2674 2679 2669 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 // SPDX-License-Identifier: GPL-2.0 #include <linux/spinlock.h> #include <linux/task_work.h> #include <linux/resume_user_mode.h> static struct callback_head work_exited; /* all we need is ->next == NULL */ /** * task_work_add - ask the @task to execute @work->func() * @task: the task which should run the callback * @work: the callback to run * @notify: how to notify the targeted task * * Queue @work for task_work_run() below and notify the @task if @notify * is @TWA_RESUME, @TWA_SIGNAL, or @TWA_SIGNAL_NO_IPI. * * @TWA_SIGNAL works like signals, in that the it will interrupt the targeted * task and run the task_work, regardless of whether the task is currently * running in the kernel or userspace. * @TWA_SIGNAL_NO_IPI works like @TWA_SIGNAL, except it doesn't send a * reschedule IPI to force the targeted task to reschedule and run task_work. * This can be advantageous if there's no strict requirement that the * task_work be run as soon as possible, just whenever the task enters the * kernel anyway. * @TWA_RESUME work is run only when the task exits the kernel and returns to * user mode, or before entering guest mode. * * Fails if the @task is exiting/exited and thus it can't process this @work. * Otherwise @work->func() will be called when the @task goes through one of * the aforementioned transitions, or exits. * * If the targeted task is exiting, then an error is returned and the work item * is not queued. It's up to the caller to arrange for an alternative mechanism * in that case. * * Note: there is no ordering guarantee on works queued here. The task_work * list is LIFO. * * RETURNS: * 0 if succeeds or -ESRCH. */ int task_work_add(struct task_struct *task, struct callback_head *work, enum task_work_notify_mode notify) { struct callback_head *head; /* record the work call stack in order to print it in KASAN reports */ kasan_record_aux_stack(work); head = READ_ONCE(task->task_works); do { if (unlikely(head == &work_exited)) return -ESRCH; work->next = head; } while (!try_cmpxchg(&task->task_works, &head, work)); switch (notify) { case TWA_NONE: break; case TWA_RESUME: set_notify_resume(task); break; case TWA_SIGNAL: set_notify_signal(task); break; case TWA_SIGNAL_NO_IPI: __set_notify_signal(task); break; default: WARN_ON_ONCE(1); break; } return 0; } /** * task_work_cancel_match - cancel a pending work added by task_work_add() * @task: the task which should execute the work * @match: match function to call * @data: data to be passed in to match function * * RETURNS: * The found work or NULL if not found. */ struct callback_head * task_work_cancel_match(struct task_struct *task, bool (*match)(struct callback_head *, void *data), void *data) { struct callback_head **pprev = &task->task_works; struct callback_head *work; unsigned long flags; if (likely(!task_work_pending(task))) return NULL; /* * If cmpxchg() fails we continue without updating pprev. * Either we raced with task_work_add() which added the * new entry before this work, we will find it again. Or * we raced with task_work_run(), *pprev == NULL/exited. */ raw_spin_lock_irqsave(&task->pi_lock, flags); work = READ_ONCE(*pprev); while (work) { if (!match(work, data)) { pprev = &work->next; work = READ_ONCE(*pprev); } else if (try_cmpxchg(pprev, &work, work->next)) break; } raw_spin_unlock_irqrestore(&task->pi_lock, flags); return work; } static bool task_work_func_match(struct callback_head *cb, void *data) { return cb->func == data; } /** * task_work_cancel - cancel a pending work added by task_work_add() * @task: the task which should execute the work * @func: identifies the work to remove * * Find the last queued pending work with ->func == @func and remove * it from queue. * * RETURNS: * The found work or NULL if not found. */ struct callback_head * task_work_cancel(struct task_struct *task, task_work_func_t func) { return task_work_cancel_match(task, task_work_func_match, func); } /** * task_work_run - execute the works added by task_work_add() * * Flush the pending works. Should be used by the core kernel code. * Called before the task returns to the user-mode or stops, or when * it exits. In the latter case task_work_add() can no longer add the * new work after task_work_run() returns. */ void task_work_run(void) { struct task_struct *task = current; struct callback_head *work, *head, *next; for (;;) { /* * work->func() can do task_work_add(), do not set * work_exited unless the list is empty. */ work = READ_ONCE(task->task_works); do { head = NULL; if (!work) { if (task->flags & PF_EXITING) head = &work_exited; else break; } } while (!try_cmpxchg(&task->task_works, &work, head)); if (!work) break; /* * Synchronize with task_work_cancel(). It can not remove * the first entry == work, cmpxchg(task_works) must fail. * But it can remove another entry from the ->next list. */ raw_spin_lock_irq(&task->pi_lock); raw_spin_unlock_irq(&task->pi_lock); do { next = work->next; work->func(work); work = next; cond_resched(); } while (work); } }
2 2 2 2 2 2 2 2 2 1 4 4 4 4 1 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 2 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. */ #include "device.h" #include "peer.h" #include "socket.h" #include "queueing.h" #include "messages.h" #include <linux/ctype.h> #include <linux/net.h> #include <linux/if_vlan.h> #include <linux/if_ether.h> #include <linux/inetdevice.h> #include <net/udp_tunnel.h> #include <net/ipv6.h> static int send4(struct wg_device *wg, struct sk_buff *skb, struct endpoint *endpoint, u8 ds, struct dst_cache *cache) { struct flowi4 fl = { .saddr = endpoint->src4.s_addr, .daddr = endpoint->addr4.sin_addr.s_addr, .fl4_dport = endpoint->addr4.sin_port, .flowi4_mark = wg->fwmark, .flowi4_proto = IPPROTO_UDP }; struct rtable *rt = NULL; struct sock *sock; int ret = 0; skb_mark_not_on_list(skb); skb->dev = wg->dev; skb->mark = wg->fwmark; rcu_read_lock_bh(); sock = rcu_dereference_bh(wg->sock4); if (unlikely(!sock)) { ret = -ENONET; goto err; } fl.fl4_sport = inet_sk(sock)->inet_sport; if (cache) rt = dst_cache_get_ip4(cache, &fl.saddr); if (!rt) { security_sk_classify_flow(sock, flowi4_to_flowi_common(&fl)); if (unlikely(!inet_confirm_addr(sock_net(sock), NULL, 0, fl.saddr, RT_SCOPE_HOST))) { endpoint->src4.s_addr = 0; endpoint->src_if4 = 0; fl.saddr = 0; if (cache) dst_cache_reset(cache); } rt = ip_route_output_flow(sock_net(sock), &fl, sock); if (unlikely(endpoint->src_if4 && ((IS_ERR(rt) && PTR_ERR(rt) == -EINVAL) || (!IS_ERR(rt) && rt->dst.dev->ifindex != endpoint->src_if4)))) { endpoint->src4.s_addr = 0; endpoint->src_if4 = 0; fl.saddr = 0; if (cache) dst_cache_reset(cache); if (!IS_ERR(rt)) ip_rt_put(rt); rt = ip_route_output_flow(sock_net(sock), &fl, sock); } if (IS_ERR(rt)) { ret = PTR_ERR(rt); net_dbg_ratelimited("%s: No route to %pISpfsc, error %d\n", wg->dev->name, &endpoint->addr, ret); goto err; } if (cache) dst_cache_set_ip4(cache, &rt->dst, fl.saddr); } skb->ignore_df = 1; udp_tunnel_xmit_skb(rt, sock, skb, fl.saddr, fl.daddr, ds, ip4_dst_hoplimit(&rt->dst), 0, fl.fl4_sport, fl.fl4_dport, false, false); goto out; err: kfree_skb(skb); out: rcu_read_unlock_bh(); return ret; } static int send6(struct wg_device *wg, struct sk_buff *skb, struct endpoint *endpoint, u8 ds, struct dst_cache *cache) { #if IS_ENABLED(CONFIG_IPV6) struct flowi6 fl = { .saddr = endpoint->src6, .daddr = endpoint->addr6.sin6_addr, .fl6_dport = endpoint->addr6.sin6_port, .flowi6_mark = wg->fwmark, .flowi6_oif = endpoint->addr6.sin6_scope_id, .flowi6_proto = IPPROTO_UDP /* TODO: addr->sin6_flowinfo */ }; struct dst_entry *dst = NULL; struct sock *sock; int ret = 0; skb_mark_not_on_list(skb); skb->dev = wg->dev; skb->mark = wg->fwmark; rcu_read_lock_bh(); sock = rcu_dereference_bh(wg->sock6); if (unlikely(!sock)) { ret = -ENONET; goto err; } fl.fl6_sport = inet_sk(sock)->inet_sport; if (cache) dst = dst_cache_get_ip6(cache, &fl.saddr); if (!dst) { security_sk_classify_flow(sock, flowi6_to_flowi_common(&fl)); if (unlikely(!ipv6_addr_any(&fl.saddr) && !ipv6_chk_addr(sock_net(sock), &fl.saddr, NULL, 0))) { endpoint->src6 = fl.saddr = in6addr_any; if (cache) dst_cache_reset(cache); } dst = ipv6_stub->ipv6_dst_lookup_flow(sock_net(sock), sock, &fl, NULL); if (IS_ERR(dst)) { ret = PTR_ERR(dst); net_dbg_ratelimited("%s: No route to %pISpfsc, error %d\n", wg->dev->name, &endpoint->addr, ret); goto err; } if (cache) dst_cache_set_ip6(cache, dst, &fl.saddr); } skb->ignore_df = 1; udp_tunnel6_xmit_skb(dst, sock, skb, skb->dev, &fl.saddr, &fl.daddr, ds, ip6_dst_hoplimit(dst), 0, fl.fl6_sport, fl.fl6_dport, false); goto out; err: kfree_skb(skb); out: rcu_read_unlock_bh(); return ret; #else kfree_skb(skb); return -EAFNOSUPPORT; #endif } int wg_socket_send_skb_to_peer(struct wg_peer *peer, struct sk_buff *skb, u8 ds) { size_t skb_len = skb->len; int ret = -EAFNOSUPPORT; read_lock_bh(&peer->endpoint_lock); if (peer->endpoint.addr.sa_family == AF_INET) ret = send4(peer->device, skb, &peer->endpoint, ds, &peer->endpoint_cache); else if (peer->endpoint.addr.sa_family == AF_INET6) ret = send6(peer->device, skb, &peer->endpoint, ds, &peer->endpoint_cache); else dev_kfree_skb(skb); if (likely(!ret)) peer->tx_bytes += skb_len; read_unlock_bh(&peer->endpoint_lock); return ret; } int wg_socket_send_buffer_to_peer(struct wg_peer *peer, void *buffer, size_t len, u8 ds) { struct sk_buff *skb = alloc_skb(len + SKB_HEADER_LEN, GFP_ATOMIC); if (unlikely(!skb)) return -ENOMEM; skb_reserve(skb, SKB_HEADER_LEN); skb_set_inner_network_header(skb, 0); skb_put_data(skb, buffer, len); return wg_socket_send_skb_to_peer(peer, skb, ds); } int wg_socket_send_buffer_as_reply_to_skb(struct wg_device *wg, struct sk_buff *in_skb, void *buffer, size_t len) { int ret = 0; struct sk_buff *skb; struct endpoint endpoint; if (unlikely(!in_skb)) return -EINVAL; ret = wg_socket_endpoint_from_skb(&endpoint, in_skb); if (unlikely(ret < 0)) return ret; skb = alloc_skb(len + SKB_HEADER_LEN, GFP_ATOMIC); if (unlikely(!skb)) return -ENOMEM; skb_reserve(skb, SKB_HEADER_LEN); skb_set_inner_network_header(skb, 0); skb_put_data(skb, buffer, len); if (endpoint.addr.sa_family == AF_INET) ret = send4(wg, skb, &endpoint, 0, NULL); else if (endpoint.addr.sa_family == AF_INET6) ret = send6(wg, skb, &endpoint, 0, NULL); /* No other possibilities if the endpoint is valid, which it is, * as we checked above. */ return ret; } int wg_socket_endpoint_from_skb(struct endpoint *endpoint, const struct sk_buff *skb) { memset(endpoint, 0, sizeof(*endpoint)); if (skb->protocol == htons(ETH_P_IP)) { endpoint->addr4.sin_family = AF_INET; endpoint->addr4.sin_port = udp_hdr(skb)->source; endpoint->addr4.sin_addr.s_addr = ip_hdr(skb)->saddr; endpoint->src4.s_addr = ip_hdr(skb)->daddr; endpoint->src_if4 = skb->skb_iif; } else if (IS_ENABLED(CONFIG_IPV6) && skb->protocol == htons(ETH_P_IPV6)) { endpoint->addr6.sin6_family = AF_INET6; endpoint->addr6.sin6_port = udp_hdr(skb)->source; endpoint->addr6.sin6_addr = ipv6_hdr(skb)->saddr; endpoint->addr6.sin6_scope_id = ipv6_iface_scope_id( &ipv6_hdr(skb)->saddr, skb->skb_iif); endpoint->src6 = ipv6_hdr(skb)->daddr; } else { return -EINVAL; } return 0; } static bool endpoint_eq(const struct endpoint *a, const struct endpoint *b) { return (a->addr.sa_family == AF_INET && b->addr.sa_family == AF_INET && a->addr4.sin_port == b->addr4.sin_port && a->addr4.sin_addr.s_addr == b->addr4.sin_addr.s_addr && a->src4.s_addr == b->src4.s_addr && a->src_if4 == b->src_if4) || (a->addr.sa_family == AF_INET6 && b->addr.sa_family == AF_INET6 && a->addr6.sin6_port == b->addr6.sin6_port && ipv6_addr_equal(&a->addr6.sin6_addr, &b->addr6.sin6_addr) && a->addr6.sin6_scope_id == b->addr6.sin6_scope_id && ipv6_addr_equal(&a->src6, &b->src6)) || unlikely(!a->addr.sa_family && !b->addr.sa_family); } void wg_socket_set_peer_endpoint(struct wg_peer *peer, const struct endpoint *endpoint) { /* First we check unlocked, in order to optimize, since it's pretty rare * that an endpoint will change. If we happen to be mid-write, and two * CPUs wind up writing the same thing or something slightly different, * it doesn't really matter much either. */ if (endpoint_eq(endpoint, &peer->endpoint)) return; write_lock_bh(&peer->endpoint_lock); if (endpoint->addr.sa_family == AF_INET) { peer->endpoint.addr4 = endpoint->addr4; peer->endpoint.src4 = endpoint->src4; peer->endpoint.src_if4 = endpoint->src_if4; } else if (IS_ENABLED(CONFIG_IPV6) && endpoint->addr.sa_family == AF_INET6) { peer->endpoint.addr6 = endpoint->addr6; peer->endpoint.src6 = endpoint->src6; } else { goto out; } dst_cache_reset(&peer->endpoint_cache); out: write_unlock_bh(&peer->endpoint_lock); } void wg_socket_set_peer_endpoint_from_skb(struct wg_peer *peer, const struct sk_buff *skb) { struct endpoint endpoint; if (!wg_socket_endpoint_from_skb(&endpoint, skb)) wg_socket_set_peer_endpoint(peer, &endpoint); } void wg_socket_clear_peer_endpoint_src(struct wg_peer *peer) { write_lock_bh(&peer->endpoint_lock); memset(&peer->endpoint.src6, 0, sizeof(peer->endpoint.src6)); dst_cache_reset_now(&peer->endpoint_cache); write_unlock_bh(&peer->endpoint_lock); } static int wg_receive(struct sock *sk, struct sk_buff *skb) { struct wg_device *wg; if (unlikely(!sk)) goto err; wg = sk->sk_user_data; if (unlikely(!wg)) goto err; skb_mark_not_on_list(skb); wg_packet_receive(wg, skb); return 0; err: kfree_skb(skb); return 0; } static void sock_free(struct sock *sock) { if (unlikely(!sock)) return; sk_clear_memalloc(sock); udp_tunnel_sock_release(sock->sk_socket); } static void set_sock_opts(struct socket *sock) { sock->sk->sk_allocation = GFP_ATOMIC; sock->sk->sk_sndbuf = INT_MAX; sk_set_memalloc(sock->sk); } int wg_socket_init(struct wg_device *wg, u16 port) { struct net *net; int ret; struct udp_tunnel_sock_cfg cfg = { .sk_user_data = wg, .encap_type = 1, .encap_rcv = wg_receive }; struct socket *new4 = NULL, *new6 = NULL; struct udp_port_cfg port4 = { .family = AF_INET, .local_ip.s_addr = htonl(INADDR_ANY), .local_udp_port = htons(port), .use_udp_checksums = true }; #if IS_ENABLED(CONFIG_IPV6) int retries = 0; struct udp_port_cfg port6 = { .family = AF_INET6, .local_ip6 = IN6ADDR_ANY_INIT, .use_udp6_tx_checksums = true, .use_udp6_rx_checksums = true, .ipv6_v6only = true }; #endif rcu_read_lock(); net = rcu_dereference(wg->creating_net); net = net ? maybe_get_net(net) : NULL; rcu_read_unlock(); if (unlikely(!net)) return -ENONET; #if IS_ENABLED(CONFIG_IPV6) retry: #endif ret = udp_sock_create(net, &port4, &new4); if (ret < 0) { pr_err("%s: Could not create IPv4 socket\n", wg->dev->name); goto out; } set_sock_opts(new4); setup_udp_tunnel_sock(net, new4, &cfg); #if IS_ENABLED(CONFIG_IPV6) if (ipv6_mod_enabled()) { port6.local_udp_port = inet_sk(new4->sk)->inet_sport; ret = udp_sock_create(net, &port6, &new6); if (ret < 0) { udp_tunnel_sock_release(new4); if (ret == -EADDRINUSE && !port && retries++ < 100) goto retry; pr_err("%s: Could not create IPv6 socket\n", wg->dev->name); goto out; } set_sock_opts(new6); setup_udp_tunnel_sock(net, new6, &cfg); } #endif wg_socket_reinit(wg, new4->sk, new6 ? new6->sk : NULL); ret = 0; out: put_net(net); return ret; } void wg_socket_reinit(struct wg_device *wg, struct sock *new4, struct sock *new6) { struct sock *old4, *old6; mutex_lock(&wg->socket_update_lock); old4 = rcu_dereference_protected(wg->sock4, lockdep_is_held(&wg->socket_update_lock)); old6 = rcu_dereference_protected(wg->sock6, lockdep_is_held(&wg->socket_update_lock)); rcu_assign_pointer(wg->sock4, new4); rcu_assign_pointer(wg->sock6, new6); if (new4) wg->incoming_port = ntohs(inet_sk(new4)->inet_sport); mutex_unlock(&wg->socket_update_lock); synchronize_net(); sock_free(old4); sock_free(old6); }
5 5 5 5 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 8 8 8 8 8 8 8 8 8 8 5 5 8 5 5 4 4 4 3 1 3 1 1 5 4 2 2 2 2 1 1 1 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 // SPDX-License-Identifier: GPL-2.0-or-later /* * inet fragments management * * Authors: Pavel Emelyanov <xemul@openvz.org> * Started as consolidation of ipv4/ip_fragment.c, * ipv6/reassembly. and ipv6 nf conntrack reassembly */ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/module.h> #include <linux/timer.h> #include <linux/mm.h> #include <linux/random.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/slab.h> #include <linux/rhashtable.h> #include <net/sock.h> #include <net/inet_frag.h> #include <net/inet_ecn.h> #include <net/ip.h> #include <net/ipv6.h> #include "../core/sock_destructor.h" /* Use skb->cb to track consecutive/adjacent fragments coming at * the end of the queue. Nodes in the rb-tree queue will * contain "runs" of one or more adjacent fragments. * * Invariants: * - next_frag is NULL at the tail of a "run"; * - the head of a "run" has the sum of all fragment lengths in frag_run_len. */ struct ipfrag_skb_cb { union { struct inet_skb_parm h4; struct inet6_skb_parm h6; }; struct sk_buff *next_frag; int frag_run_len; int ip_defrag_offset; }; #define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb)) static void fragcb_clear(struct sk_buff *skb) { RB_CLEAR_NODE(&skb->rbnode); FRAG_CB(skb)->next_frag = NULL; FRAG_CB(skb)->frag_run_len = skb->len; } /* Append skb to the last "run". */ static void fragrun_append_to_last(struct inet_frag_queue *q, struct sk_buff *skb) { fragcb_clear(skb); FRAG_CB(q->last_run_head)->frag_run_len += skb->len; FRAG_CB(q->fragments_tail)->next_frag = skb; q->fragments_tail = skb; } /* Create a new "run" with the skb. */ static void fragrun_create(struct inet_frag_queue *q, struct sk_buff *skb) { BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb)); fragcb_clear(skb); if (q->last_run_head) rb_link_node(&skb->rbnode, &q->last_run_head->rbnode, &q->last_run_head->rbnode.rb_right); else rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node); rb_insert_color(&skb->rbnode, &q->rb_fragments); q->fragments_tail = skb; q->last_run_head = skb; } /* Given the OR values of all fragments, apply RFC 3168 5.3 requirements * Value : 0xff if frame should be dropped. * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field */ const u8 ip_frag_ecn_table[16] = { /* at least one fragment had CE, and others ECT_0 or ECT_1 */ [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = INET_ECN_CE, [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = INET_ECN_CE, [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = INET_ECN_CE, /* invalid combinations : drop frame */ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff, [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff, [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff, [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff, [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff, [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff, [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff, }; EXPORT_SYMBOL(ip_frag_ecn_table); int inet_frags_init(struct inet_frags *f) { f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0, NULL); if (!f->frags_cachep) return -ENOMEM; refcount_set(&f->refcnt, 1); init_completion(&f->completion); return 0; } EXPORT_SYMBOL(inet_frags_init); void inet_frags_fini(struct inet_frags *f) { if (refcount_dec_and_test(&f->refcnt)) complete(&f->completion); wait_for_completion(&f->completion); kmem_cache_destroy(f->frags_cachep); f->frags_cachep = NULL; } EXPORT_SYMBOL(inet_frags_fini); /* called from rhashtable_free_and_destroy() at netns_frags dismantle */ static void inet_frags_free_cb(void *ptr, void *arg) { struct inet_frag_queue *fq = ptr; int count; count = del_timer_sync(&fq->timer) ? 1 : 0; spin_lock_bh(&fq->lock); fq->flags |= INET_FRAG_DROP; if (!(fq->flags & INET_FRAG_COMPLETE)) { fq->flags |= INET_FRAG_COMPLETE; count++; } else if (fq->flags & INET_FRAG_HASH_DEAD) { count++; } spin_unlock_bh(&fq->lock); if (refcount_sub_and_test(count, &fq->refcnt)) inet_frag_destroy(fq); } static LLIST_HEAD(fqdir_free_list); static void fqdir_free_fn(struct work_struct *work) { struct llist_node *kill_list; struct fqdir *fqdir, *tmp; struct inet_frags *f; /* Atomically snapshot the list of fqdirs to free */ kill_list = llist_del_all(&fqdir_free_list); /* We need to make sure all ongoing call_rcu(..., inet_frag_destroy_rcu) * have completed, since they need to dereference fqdir. * Would it not be nice to have kfree_rcu_barrier() ? :) */ rcu_barrier(); llist_for_each_entry_safe(fqdir, tmp, kill_list, free_list) { f = fqdir->f; if (refcount_dec_and_test(&f->refcnt)) complete(&f->completion); kfree(fqdir); } } static DECLARE_DELAYED_WORK(fqdir_free_work, fqdir_free_fn); static void fqdir_work_fn(struct work_struct *work) { struct fqdir *fqdir = container_of(work, struct fqdir, destroy_work); rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL); if (llist_add(&fqdir->free_list, &fqdir_free_list)) queue_delayed_work(system_wq, &fqdir_free_work, HZ); } int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net) { struct fqdir *fqdir = kzalloc(sizeof(*fqdir), GFP_KERNEL); int res; if (!fqdir) return -ENOMEM; fqdir->f = f; fqdir->net = net; res = rhashtable_init(&fqdir->rhashtable, &fqdir->f->rhash_params); if (res < 0) { kfree(fqdir); return res; } refcount_inc(&f->refcnt); *fqdirp = fqdir; return 0; } EXPORT_SYMBOL(fqdir_init); static struct workqueue_struct *inet_frag_wq; static int __init inet_frag_wq_init(void) { inet_frag_wq = create_workqueue("inet_frag_wq"); if (!inet_frag_wq) panic("Could not create inet frag workq"); return 0; } pure_initcall(inet_frag_wq_init); void fqdir_exit(struct fqdir *fqdir) { INIT_WORK(&fqdir->destroy_work, fqdir_work_fn); queue_work(inet_frag_wq, &fqdir->destroy_work); } EXPORT_SYMBOL(fqdir_exit); void inet_frag_kill(struct inet_frag_queue *fq) { if (del_timer(&fq->timer)) refcount_dec(&fq->refcnt); if (!(fq->flags & INET_FRAG_COMPLETE)) { struct fqdir *fqdir = fq->fqdir; fq->flags |= INET_FRAG_COMPLETE; rcu_read_lock(); /* The RCU read lock provides a memory barrier * guaranteeing that if fqdir->dead is false then * the hash table destruction will not start until * after we unlock. Paired with fqdir_pre_exit(). */ if (!READ_ONCE(fqdir->dead)) { rhashtable_remove_fast(&fqdir->rhashtable, &fq->node, fqdir->f->rhash_params); refcount_dec(&fq->refcnt); } else { fq->flags |= INET_FRAG_HASH_DEAD; } rcu_read_unlock(); } } EXPORT_SYMBOL(inet_frag_kill); static void inet_frag_destroy_rcu(struct rcu_head *head) { struct inet_frag_queue *q = container_of(head, struct inet_frag_queue, rcu); struct inet_frags *f = q->fqdir->f; if (f->destructor) f->destructor(q); kmem_cache_free(f->frags_cachep, q); } unsigned int inet_frag_rbtree_purge(struct rb_root *root, enum skb_drop_reason reason) { struct rb_node *p = rb_first(root); unsigned int sum = 0; while (p) { struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); p = rb_next(p); rb_erase(&skb->rbnode, root); while (skb) { struct sk_buff *next = FRAG_CB(skb)->next_frag; sum += skb->truesize; kfree_skb_reason(skb, reason); skb = next; } } return sum; } EXPORT_SYMBOL(inet_frag_rbtree_purge); void inet_frag_destroy(struct inet_frag_queue *q) { unsigned int sum, sum_truesize = 0; enum skb_drop_reason reason; struct inet_frags *f; struct fqdir *fqdir; WARN_ON(!(q->flags & INET_FRAG_COMPLETE)); reason = (q->flags & INET_FRAG_DROP) ? SKB_DROP_REASON_FRAG_REASM_TIMEOUT : SKB_CONSUMED; WARN_ON(del_timer(&q->timer) != 0); /* Release all fragment data. */ fqdir = q->fqdir; f = fqdir->f; sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments, reason); sum = sum_truesize + f->qsize; call_rcu(&q->rcu, inet_frag_destroy_rcu); sub_frag_mem_limit(fqdir, sum); } EXPORT_SYMBOL(inet_frag_destroy); static struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir, struct inet_frags *f, void *arg) { struct inet_frag_queue *q; q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC); if (!q) return NULL; q->fqdir = fqdir; f->constructor(q, arg); add_frag_mem_limit(fqdir, f->qsize); timer_setup(&q->timer, f->frag_expire, 0); spin_lock_init(&q->lock); refcount_set(&q->refcnt, 3); return q; } static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir, void *arg, struct inet_frag_queue **prev) { struct inet_frags *f = fqdir->f; struct inet_frag_queue *q; q = inet_frag_alloc(fqdir, f, arg); if (!q) { *prev = ERR_PTR(-ENOMEM); return NULL; } mod_timer(&q->timer, jiffies + fqdir->timeout); *prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key, &q->node, f->rhash_params); if (*prev) { q->flags |= INET_FRAG_COMPLETE; inet_frag_kill(q); inet_frag_destroy(q); return NULL; } return q; } /* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */ struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key) { /* This pairs with WRITE_ONCE() in fqdir_pre_exit(). */ long high_thresh = READ_ONCE(fqdir->high_thresh); struct inet_frag_queue *fq = NULL, *prev; if (!high_thresh || frag_mem_limit(fqdir) > high_thresh) return NULL; rcu_read_lock(); prev = rhashtable_lookup(&fqdir->rhashtable, key, fqdir->f->rhash_params); if (!prev) fq = inet_frag_create(fqdir, key, &prev); if (!IS_ERR_OR_NULL(prev)) { fq = prev; if (!refcount_inc_not_zero(&fq->refcnt)) fq = NULL; } rcu_read_unlock(); return fq; } EXPORT_SYMBOL(inet_frag_find); int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb, int offset, int end) { struct sk_buff *last = q->fragments_tail; /* RFC5722, Section 4, amended by Errata ID : 3089 * When reassembling an IPv6 datagram, if * one or more its constituent fragments is determined to be an * overlapping fragment, the entire datagram (and any constituent * fragments) MUST be silently discarded. * * Duplicates, however, should be ignored (i.e. skb dropped, but the * queue/fragments kept for later reassembly). */ if (!last) fragrun_create(q, skb); /* First fragment. */ else if (FRAG_CB(last)->ip_defrag_offset + last->len < end) { /* This is the common case: skb goes to the end. */ /* Detect and discard overlaps. */ if (offset < FRAG_CB(last)->ip_defrag_offset + last->len) return IPFRAG_OVERLAP; if (offset == FRAG_CB(last)->ip_defrag_offset + last->len) fragrun_append_to_last(q, skb); else fragrun_create(q, skb); } else { /* Binary search. Note that skb can become the first fragment, * but not the last (covered above). */ struct rb_node **rbn, *parent; rbn = &q->rb_fragments.rb_node; do { struct sk_buff *curr; int curr_run_end; parent = *rbn; curr = rb_to_skb(parent); curr_run_end = FRAG_CB(curr)->ip_defrag_offset + FRAG_CB(curr)->frag_run_len; if (end <= FRAG_CB(curr)->ip_defrag_offset) rbn = &parent->rb_left; else if (offset >= curr_run_end) rbn = &parent->rb_right; else if (offset >= FRAG_CB(curr)->ip_defrag_offset && end <= curr_run_end) return IPFRAG_DUP; else return IPFRAG_OVERLAP; } while (*rbn); /* Here we have parent properly set, and rbn pointing to * one of its NULL left/right children. Insert skb. */ fragcb_clear(skb); rb_link_node(&skb->rbnode, parent, rbn); rb_insert_color(&skb->rbnode, &q->rb_fragments); } FRAG_CB(skb)->ip_defrag_offset = offset; return IPFRAG_OK; } EXPORT_SYMBOL(inet_frag_queue_insert); void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, struct sk_buff *parent) { struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments); void (*destructor)(struct sk_buff *); unsigned int orig_truesize = 0; struct sk_buff **nextp = NULL; struct sock *sk = skb->sk; int delta; if (sk && is_skb_wmem(skb)) { /* TX: skb->sk might have been passed as argument to * dst->output and must remain valid until tx completes. * * Move sk to reassembled skb and fix up wmem accounting. */ orig_truesize = skb->truesize; destructor = skb->destructor; } if (head != skb) { fp = skb_clone(skb, GFP_ATOMIC); if (!fp) { head = skb; goto out_restore_sk; } FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag; if (RB_EMPTY_NODE(&skb->rbnode)) FRAG_CB(parent)->next_frag = fp; else rb_replace_node(&skb->rbnode, &fp->rbnode, &q->rb_fragments); if (q->fragments_tail == skb) q->fragments_tail = fp; if (orig_truesize) { /* prevent skb_morph from releasing sk */ skb->sk = NULL; skb->destructor = NULL; } skb_morph(skb, head); FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag; rb_replace_node(&head->rbnode, &skb->rbnode, &q->rb_fragments); consume_skb(head); head = skb; } WARN_ON(FRAG_CB(head)->ip_defrag_offset != 0); delta = -head->truesize; /* Head of list must not be cloned. */ if (skb_unclone(head, GFP_ATOMIC)) goto out_restore_sk; delta += head->truesize; if (delta) add_frag_mem_limit(q->fqdir, delta); /* If the first fragment is fragmented itself, we split * it to two chunks: the first with data and paged part * and the second, holding only fragments. */ if (skb_has_frag_list(head)) { struct sk_buff *clone; int i, plen = 0; clone = alloc_skb(0, GFP_ATOMIC); if (!clone) goto out_restore_sk; skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; skb_frag_list_init(head); for (i = 0; i < skb_shinfo(head)->nr_frags; i++) plen += skb_frag_size(&skb_shinfo(head)->frags[i]); clone->data_len = head->data_len - plen; clone->len = clone->data_len; head->truesize += clone->truesize; clone->csum = 0; clone->ip_summed = head->ip_summed; add_frag_mem_limit(q->fqdir, clone->truesize); skb_shinfo(head)->frag_list = clone; nextp = &clone->next; } else { nextp = &skb_shinfo(head)->frag_list; } out_restore_sk: if (orig_truesize) { int ts_delta = head->truesize - orig_truesize; /* if this reassembled skb is fragmented later, * fraglist skbs will get skb->sk assigned from head->sk, * and each frag skb will be released via sock_wfree. * * Update sk_wmem_alloc. */ head->sk = sk; head->destructor = destructor; refcount_add(ts_delta, &sk->sk_wmem_alloc); } return nextp; } EXPORT_SYMBOL(inet_frag_reasm_prepare); void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, void *reasm_data, bool try_coalesce) { struct sock *sk = is_skb_wmem(head) ? head->sk : NULL; const unsigned int head_truesize = head->truesize; struct sk_buff **nextp = reasm_data; struct rb_node *rbn; struct sk_buff *fp; int sum_truesize; skb_push(head, head->data - skb_network_header(head)); /* Traverse the tree in order, to build frag_list. */ fp = FRAG_CB(head)->next_frag; rbn = rb_next(&head->rbnode); rb_erase(&head->rbnode, &q->rb_fragments); sum_truesize = head->truesize; while (rbn || fp) { /* fp points to the next sk_buff in the current run; * rbn points to the next run. */ /* Go through the current run. */ while (fp) { struct sk_buff *next_frag = FRAG_CB(fp)->next_frag; bool stolen; int delta; sum_truesize += fp->truesize; if (head->ip_summed != fp->ip_summed) head->ip_summed = CHECKSUM_NONE; else if (head->ip_summed == CHECKSUM_COMPLETE) head->csum = csum_add(head->csum, fp->csum); if (try_coalesce && skb_try_coalesce(head, fp, &stolen, &delta)) { kfree_skb_partial(fp, stolen); } else { fp->prev = NULL; memset(&fp->rbnode, 0, sizeof(fp->rbnode)); fp->sk = NULL; head->data_len += fp->len; head->len += fp->len; head->truesize += fp->truesize; *nextp = fp; nextp = &fp->next; } fp = next_frag; } /* Move to the next run. */ if (rbn) { struct rb_node *rbnext = rb_next(rbn); fp = rb_to_skb(rbn); rb_erase(rbn, &q->rb_fragments); rbn = rbnext; } } sub_frag_mem_limit(q->fqdir, sum_truesize); *nextp = NULL; skb_mark_not_on_list(head); head->prev = NULL; head->tstamp = q->stamp; head->mono_delivery_time = q->mono_delivery_time; if (sk) refcount_add(sum_truesize - head_truesize, &sk->sk_wmem_alloc); } EXPORT_SYMBOL(inet_frag_reasm_finish); struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q) { struct sk_buff *head, *skb; head = skb_rb_first(&q->rb_fragments); if (!head) return NULL; skb = FRAG_CB(head)->next_frag; if (skb) rb_replace_node(&head->rbnode, &skb->rbnode, &q->rb_fragments); else rb_erase(&head->rbnode, &q->rb_fragments); memset(&head->rbnode, 0, sizeof(head->rbnode)); barrier(); if (head == q->fragments_tail) q->fragments_tail = NULL; sub_frag_mem_limit(q->fqdir, head->truesize); return head; } EXPORT_SYMBOL(inet_frag_pull_head);
390 390 390 388 233 237 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 #include <linux/bpf.h> #include <linux/vmalloc.h> #include <linux/fdtable.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/kernel.h> #include <linux/idr.h> #include <linux/namei.h> #include <linux/user_namespace.h> #include <linux/security.h> static bool bpf_ns_capable(struct user_namespace *ns, int cap) { return ns_capable(ns, cap) || (cap != CAP_SYS_ADMIN && ns_capable(ns, CAP_SYS_ADMIN)); } bool bpf_token_capable(const struct bpf_token *token, int cap) { struct user_namespace *userns; /* BPF token allows ns_capable() level of capabilities */ userns = token ? token->userns : &init_user_ns; if (!bpf_ns_capable(userns, cap)) return false; if (token && security_bpf_token_capable(token, cap) < 0) return false; return true; } void bpf_token_inc(struct bpf_token *token) { atomic64_inc(&token->refcnt); } static void bpf_token_free(struct bpf_token *token) { security_bpf_token_free(token); put_user_ns(token->userns); kfree(token); } static void bpf_token_put_deferred(struct work_struct *work) { struct bpf_token *token = container_of(work, struct bpf_token, work); bpf_token_free(token); } void bpf_token_put(struct bpf_token *token) { if (!token) return; if (!atomic64_dec_and_test(&token->refcnt)) return; INIT_WORK(&token->work, bpf_token_put_deferred); schedule_work(&token->work); } static int bpf_token_release(struct inode *inode, struct file *filp) { struct bpf_token *token = filp->private_data; bpf_token_put(token); return 0; } static void bpf_token_show_fdinfo(struct seq_file *m, struct file *filp) { struct bpf_token *token = filp->private_data; u64 mask; BUILD_BUG_ON(__MAX_BPF_CMD >= 64); mask = BIT_ULL(__MAX_BPF_CMD) - 1; if ((token->allowed_cmds & mask) == mask) seq_printf(m, "allowed_cmds:\tany\n"); else seq_printf(m, "allowed_cmds:\t0x%llx\n", token->allowed_cmds); BUILD_BUG_ON(__MAX_BPF_MAP_TYPE >= 64); mask = BIT_ULL(__MAX_BPF_MAP_TYPE) - 1; if ((token->allowed_maps & mask) == mask) seq_printf(m, "allowed_maps:\tany\n"); else seq_printf(m, "allowed_maps:\t0x%llx\n", token->allowed_maps); BUILD_BUG_ON(__MAX_BPF_PROG_TYPE >= 64); mask = BIT_ULL(__MAX_BPF_PROG_TYPE) - 1; if ((token->allowed_progs & mask) == mask) seq_printf(m, "allowed_progs:\tany\n"); else seq_printf(m, "allowed_progs:\t0x%llx\n", token->allowed_progs); BUILD_BUG_ON(__MAX_BPF_ATTACH_TYPE >= 64); mask = BIT_ULL(__MAX_BPF_ATTACH_TYPE) - 1; if ((token->allowed_attachs & mask) == mask) seq_printf(m, "allowed_attachs:\tany\n"); else seq_printf(m, "allowed_attachs:\t0x%llx\n", token->allowed_attachs); } #define BPF_TOKEN_INODE_NAME "bpf-token" static const struct inode_operations bpf_token_iops = { }; static const struct file_operations bpf_token_fops = { .release = bpf_token_release, .show_fdinfo = bpf_token_show_fdinfo, }; int bpf_token_create(union bpf_attr *attr) { struct bpf_mount_opts *mnt_opts; struct bpf_token *token = NULL; struct user_namespace *userns; struct inode *inode; struct file *file; struct path path; struct fd f; umode_t mode; int err, fd; f = fdget(attr->token_create.bpffs_fd); if (!f.file) return -EBADF; path = f.file->f_path; path_get(&path); fdput(f); if (path.dentry != path.mnt->mnt_sb->s_root) { err = -EINVAL; goto out_path; } if (path.mnt->mnt_sb->s_op != &bpf_super_ops) { err = -EINVAL; goto out_path; } err = path_permission(&path, MAY_ACCESS); if (err) goto out_path; userns = path.dentry->d_sb->s_user_ns; /* * Enforce that creators of BPF tokens are in the same user * namespace as the BPF FS instance. This makes reasoning about * permissions a lot easier and we can always relax this later. */ if (current_user_ns() != userns) { err = -EPERM; goto out_path; } if (!ns_capable(userns, CAP_BPF)) { err = -EPERM; goto out_path; } /* Creating BPF token in init_user_ns doesn't make much sense. */ if (current_user_ns() == &init_user_ns) { err = -EOPNOTSUPP; goto out_path; } mnt_opts = path.dentry->d_sb->s_fs_info; if (mnt_opts->delegate_cmds == 0 && mnt_opts->delegate_maps == 0 && mnt_opts->delegate_progs == 0 && mnt_opts->delegate_attachs == 0) { err = -ENOENT; /* no BPF token delegation is set up */ goto out_path; } mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask()); inode = bpf_get_inode(path.mnt->mnt_sb, NULL, mode); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_path; } inode->i_op = &bpf_token_iops; inode->i_fop = &bpf_token_fops; clear_nlink(inode); /* make sure it is unlinked */ file = alloc_file_pseudo(inode, path.mnt, BPF_TOKEN_INODE_NAME, O_RDWR, &bpf_token_fops); if (IS_ERR(file)) { iput(inode); err = PTR_ERR(file); goto out_path; } token = kzalloc(sizeof(*token), GFP_USER); if (!token) { err = -ENOMEM; goto out_file; } atomic64_set(&token->refcnt, 1); /* remember bpffs owning userns for future ns_capable() checks */ token->userns = get_user_ns(userns); token->allowed_cmds = mnt_opts->delegate_cmds; token->allowed_maps = mnt_opts->delegate_maps; token->allowed_progs = mnt_opts->delegate_progs; token->allowed_attachs = mnt_opts->delegate_attachs; err = security_bpf_token_create(token, attr, &path); if (err) goto out_token; fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) { err = fd; goto out_token; } file->private_data = token; fd_install(fd, file); path_put(&path); return fd; out_token: bpf_token_free(token); out_file: fput(file); out_path: path_put(&path); return err; } struct bpf_token *bpf_token_get_from_fd(u32 ufd) { struct fd f = fdget(ufd); struct bpf_token *token; if (!f.file) return ERR_PTR(-EBADF); if (f.file->f_op != &bpf_token_fops) { fdput(f); return ERR_PTR(-EINVAL); } token = f.file->private_data; bpf_token_inc(token); fdput(f); return token; } bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd) { if (!token) return false; if (!(token->allowed_cmds & BIT_ULL(cmd))) return false; return security_bpf_token_cmd(token, cmd) == 0; } bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type) { if (!token || type >= __MAX_BPF_MAP_TYPE) return false; return token->allowed_maps & BIT_ULL(type); } bool bpf_token_allow_prog_type(const struct bpf_token *token, enum bpf_prog_type prog_type, enum bpf_attach_type attach_type) { if (!token || prog_type >= __MAX_BPF_PROG_TYPE || attach_type >= __MAX_BPF_ATTACH_TYPE) return false; return (token->allowed_progs & BIT_ULL(prog_type)) && (token->allowed_attachs & BIT_ULL(attach_type)); }
16 35 40 6 5 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NF_CONNTRACK_EXTEND_H #define _NF_CONNTRACK_EXTEND_H #include <linux/slab.h> #include <net/netfilter/nf_conntrack.h> enum nf_ct_ext_id { NF_CT_EXT_HELPER, #if IS_ENABLED(CONFIG_NF_NAT) NF_CT_EXT_NAT, #endif NF_CT_EXT_SEQADJ, NF_CT_EXT_ACCT, #ifdef CONFIG_NF_CONNTRACK_EVENTS NF_CT_EXT_ECACHE, #endif #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP NF_CT_EXT_TSTAMP, #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT NF_CT_EXT_TIMEOUT, #endif #ifdef CONFIG_NF_CONNTRACK_LABELS NF_CT_EXT_LABELS, #endif #if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY) NF_CT_EXT_SYNPROXY, #endif #if IS_ENABLED(CONFIG_NET_ACT_CT) NF_CT_EXT_ACT_CT, #endif NF_CT_EXT_NUM, }; /* Extensions: optional stuff which isn't permanently in struct. */ struct nf_ct_ext { u8 offset[NF_CT_EXT_NUM]; u8 len; unsigned int gen_id; char data[] __aligned(8); }; static inline bool __nf_ct_ext_exist(const struct nf_ct_ext *ext, u8 id) { return !!ext->offset[id]; } static inline bool nf_ct_ext_exist(const struct nf_conn *ct, u8 id) { return (ct->ext && __nf_ct_ext_exist(ct->ext, id)); } void *__nf_ct_ext_find(const struct nf_ct_ext *ext, u8 id); static inline void *nf_ct_ext_find(const struct nf_conn *ct, u8 id) { struct nf_ct_ext *ext = ct->ext; if (!ext || !__nf_ct_ext_exist(ext, id)) return NULL; if (unlikely(ext->gen_id)) return __nf_ct_ext_find(ext, id); return (void *)ct->ext + ct->ext->offset[id]; } /* Add this type, returns pointer to data or NULL. */ void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp); /* ext genid. if ext->id != ext_genid, extensions cannot be used * anymore unless conntrack has CONFIRMED bit set. */ extern atomic_t nf_conntrack_ext_genid; void nf_ct_ext_bump_genid(void); #endif /* _NF_CONNTRACK_EXTEND_H */
16 40 72 18 2 4 4 92 21 21 24 82 40 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM io_uring #if !defined(_TRACE_IO_URING_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_IO_URING_H #include <linux/tracepoint.h> #include <uapi/linux/io_uring.h> #include <linux/io_uring_types.h> #include <linux/io_uring.h> struct io_wq_work; /** * io_uring_create - called after a new io_uring context was prepared * * @fd: corresponding file descriptor * @ctx: pointer to a ring context structure * @sq_entries: actual SQ size * @cq_entries: actual CQ size * @flags: SQ ring flags, provided to io_uring_setup(2) * * Allows to trace io_uring creation and provide pointer to a context, that can * be used later to find correlated events. */ TRACE_EVENT(io_uring_create, TP_PROTO(int fd, void *ctx, u32 sq_entries, u32 cq_entries, u32 flags), TP_ARGS(fd, ctx, sq_entries, cq_entries, flags), TP_STRUCT__entry ( __field( int, fd ) __field( void *, ctx ) __field( u32, sq_entries ) __field( u32, cq_entries ) __field( u32, flags ) ), TP_fast_assign( __entry->fd = fd; __entry->ctx = ctx; __entry->sq_entries = sq_entries; __entry->cq_entries = cq_entries; __entry->flags = flags; ), TP_printk("ring %p, fd %d sq size %d, cq size %d, flags 0x%x", __entry->ctx, __entry->fd, __entry->sq_entries, __entry->cq_entries, __entry->flags) ); /** * io_uring_register - called after a buffer/file/eventfd was successfully * registered for a ring * * @ctx: pointer to a ring context structure * @opcode: describes which operation to perform * @nr_user_files: number of registered files * @nr_user_bufs: number of registered buffers * @ret: return code * * Allows to trace fixed files/buffers, that could be registered to * avoid an overhead of getting references to them for every operation. This * event, together with io_uring_file_get, can provide a full picture of how * much overhead one can reduce via fixing. */ TRACE_EVENT(io_uring_register, TP_PROTO(void *ctx, unsigned opcode, unsigned nr_files, unsigned nr_bufs, long ret), TP_ARGS(ctx, opcode, nr_files, nr_bufs, ret), TP_STRUCT__entry ( __field( void *, ctx ) __field( unsigned, opcode ) __field( unsigned, nr_files) __field( unsigned, nr_bufs ) __field( long, ret ) ), TP_fast_assign( __entry->ctx = ctx; __entry->opcode = opcode; __entry->nr_files = nr_files; __entry->nr_bufs = nr_bufs; __entry->ret = ret; ), TP_printk("ring %p, opcode %d, nr_user_files %d, nr_user_bufs %d, " "ret %ld", __entry->ctx, __entry->opcode, __entry->nr_files, __entry->nr_bufs, __entry->ret) ); /** * io_uring_file_get - called before getting references to an SQE file * * @req: pointer to a submitted request * @fd: SQE file descriptor * * Allows to trace out how often an SQE file reference is obtained, which can * help figuring out if it makes sense to use fixed files, or check that fixed * files are used correctly. */ TRACE_EVENT(io_uring_file_get, TP_PROTO(struct io_kiocb *req, int fd), TP_ARGS(req, fd), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( u64, user_data ) __field( int, fd ) ), TP_fast_assign( __entry->ctx = req->ctx; __entry->req = req; __entry->user_data = req->cqe.user_data; __entry->fd = fd; ), TP_printk("ring %p, req %p, user_data 0x%llx, fd %d", __entry->ctx, __entry->req, __entry->user_data, __entry->fd) ); /** * io_uring_queue_async_work - called before submitting a new async work * * @req: pointer to a submitted request * @rw: type of workqueue, hashed or normal * * Allows to trace asynchronous work submission. */ TRACE_EVENT(io_uring_queue_async_work, TP_PROTO(struct io_kiocb *req, int rw), TP_ARGS(req, rw), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( u64, user_data ) __field( u8, opcode ) __field( unsigned long long, flags ) __field( struct io_wq_work *, work ) __field( int, rw ) __string( op_str, io_uring_get_opcode(req->opcode) ) ), TP_fast_assign( __entry->ctx = req->ctx; __entry->req = req; __entry->user_data = req->cqe.user_data; __entry->flags = (__force unsigned long long) req->flags; __entry->opcode = req->opcode; __entry->work = &req->work; __entry->rw = rw; __assign_str(op_str); ), TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%llx, %s queue, work %p", __entry->ctx, __entry->req, __entry->user_data, __get_str(op_str), __entry->flags, __entry->rw ? "hashed" : "normal", __entry->work) ); /** * io_uring_defer - called when an io_uring request is deferred * * @req: pointer to a deferred request * * Allows to track deferred requests, to get an insight about what requests are * not started immediately. */ TRACE_EVENT(io_uring_defer, TP_PROTO(struct io_kiocb *req), TP_ARGS(req), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( unsigned long long, data ) __field( u8, opcode ) __string( op_str, io_uring_get_opcode(req->opcode) ) ), TP_fast_assign( __entry->ctx = req->ctx; __entry->req = req; __entry->data = req->cqe.user_data; __entry->opcode = req->opcode; __assign_str(op_str); ), TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s", __entry->ctx, __entry->req, __entry->data, __get_str(op_str)) ); /** * io_uring_link - called before the io_uring request added into link_list of * another request * * @req: pointer to a linked request * @target_req: pointer to a previous request, that would contain @req * * Allows to track linked requests, to understand dependencies between requests * and how does it influence their execution flow. */ TRACE_EVENT(io_uring_link, TP_PROTO(struct io_kiocb *req, struct io_kiocb *target_req), TP_ARGS(req, target_req), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( void *, target_req ) ), TP_fast_assign( __entry->ctx = req->ctx; __entry->req = req; __entry->target_req = target_req; ), TP_printk("ring %p, request %p linked after %p", __entry->ctx, __entry->req, __entry->target_req) ); /** * io_uring_cqring_wait - called before start waiting for an available CQE * * @ctx: pointer to a ring context structure * @min_events: minimal number of events to wait for * * Allows to track waiting for CQE, so that we can e.g. troubleshoot * situations, when an application wants to wait for an event, that never * comes. */ TRACE_EVENT(io_uring_cqring_wait, TP_PROTO(void *ctx, int min_events), TP_ARGS(ctx, min_events), TP_STRUCT__entry ( __field( void *, ctx ) __field( int, min_events ) ), TP_fast_assign( __entry->ctx = ctx; __entry->min_events = min_events; ), TP_printk("ring %p, min_events %d", __entry->ctx, __entry->min_events) ); /** * io_uring_fail_link - called before failing a linked request * * @req: request, which links were cancelled * @link: cancelled link * * Allows to track linked requests cancellation, to see not only that some work * was cancelled, but also which request was the reason. */ TRACE_EVENT(io_uring_fail_link, TP_PROTO(struct io_kiocb *req, struct io_kiocb *link), TP_ARGS(req, link), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( unsigned long long, user_data ) __field( u8, opcode ) __field( void *, link ) __string( op_str, io_uring_get_opcode(req->opcode) ) ), TP_fast_assign( __entry->ctx = req->ctx; __entry->req = req; __entry->user_data = req->cqe.user_data; __entry->opcode = req->opcode; __entry->link = link; __assign_str(op_str); ), TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, link %p", __entry->ctx, __entry->req, __entry->user_data, __get_str(op_str), __entry->link) ); /** * io_uring_complete - called when completing an SQE * * @ctx: pointer to a ring context structure * @req: pointer to a submitted request * @user_data: user data associated with the request * @res: result of the request * @cflags: completion flags * @extra1: extra 64-bit data for CQE32 * @extra2: extra 64-bit data for CQE32 * */ TRACE_EVENT(io_uring_complete, TP_PROTO(void *ctx, void *req, u64 user_data, int res, unsigned cflags, u64 extra1, u64 extra2), TP_ARGS(ctx, req, user_data, res, cflags, extra1, extra2), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( u64, user_data ) __field( int, res ) __field( unsigned, cflags ) __field( u64, extra1 ) __field( u64, extra2 ) ), TP_fast_assign( __entry->ctx = ctx; __entry->req = req; __entry->user_data = user_data; __entry->res = res; __entry->cflags = cflags; __entry->extra1 = extra1; __entry->extra2 = extra2; ), TP_printk("ring %p, req %p, user_data 0x%llx, result %d, cflags 0x%x " "extra1 %llu extra2 %llu ", __entry->ctx, __entry->req, __entry->user_data, __entry->res, __entry->cflags, (unsigned long long) __entry->extra1, (unsigned long long) __entry->extra2) ); /** * io_uring_submit_req - called before submitting a request * * @req: pointer to a submitted request * * Allows to track SQE submitting, to understand what was the source of it, SQ * thread or io_uring_enter call. */ TRACE_EVENT(io_uring_submit_req, TP_PROTO(struct io_kiocb *req), TP_ARGS(req), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( unsigned long long, user_data ) __field( u8, opcode ) __field( unsigned long long, flags ) __field( bool, sq_thread ) __string( op_str, io_uring_get_opcode(req->opcode) ) ), TP_fast_assign( __entry->ctx = req->ctx; __entry->req = req; __entry->user_data = req->cqe.user_data; __entry->opcode = req->opcode; __entry->flags = (__force unsigned long long) req->flags; __entry->sq_thread = req->ctx->flags & IORING_SETUP_SQPOLL; __assign_str(op_str); ), TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%llx, " "sq_thread %d", __entry->ctx, __entry->req, __entry->user_data, __get_str(op_str), __entry->flags, __entry->sq_thread) ); /* * io_uring_poll_arm - called after arming a poll wait if successful * * @req: pointer to the armed request * @mask: request poll events mask * @events: registered events of interest * * Allows to track which fds are waiting for and what are the events of * interest. */ TRACE_EVENT(io_uring_poll_arm, TP_PROTO(struct io_kiocb *req, int mask, int events), TP_ARGS(req, mask, events), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( unsigned long long, user_data ) __field( u8, opcode ) __field( int, mask ) __field( int, events ) __string( op_str, io_uring_get_opcode(req->opcode) ) ), TP_fast_assign( __entry->ctx = req->ctx; __entry->req = req; __entry->user_data = req->cqe.user_data; __entry->opcode = req->opcode; __entry->mask = mask; __entry->events = events; __assign_str(op_str); ), TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask 0x%x, events 0x%x", __entry->ctx, __entry->req, __entry->user_data, __get_str(op_str), __entry->mask, __entry->events) ); /* * io_uring_task_add - called after adding a task * * @req: pointer to request * @mask: request poll events mask * */ TRACE_EVENT(io_uring_task_add, TP_PROTO(struct io_kiocb *req, int mask), TP_ARGS(req, mask), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( unsigned long long, user_data ) __field( u8, opcode ) __field( int, mask ) __string( op_str, io_uring_get_opcode(req->opcode) ) ), TP_fast_assign( __entry->ctx = req->ctx; __entry->req = req; __entry->user_data = req->cqe.user_data; __entry->opcode = req->opcode; __entry->mask = mask; __assign_str(op_str); ), TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask %x", __entry->ctx, __entry->req, __entry->user_data, __get_str(op_str), __entry->mask) ); /* * io_uring_req_failed - called when an sqe is errored dring submission * * @sqe: pointer to the io_uring_sqe that failed * @req: pointer to request * @error: error it failed with * * Allows easier diagnosing of malformed requests in production systems. */ TRACE_EVENT(io_uring_req_failed, TP_PROTO(const struct io_uring_sqe *sqe, struct io_kiocb *req, int error), TP_ARGS(sqe, req, error), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( unsigned long long, user_data ) __field( u8, opcode ) __field( u8, flags ) __field( u8, ioprio ) __field( u64, off ) __field( u64, addr ) __field( u32, len ) __field( u32, op_flags ) __field( u16, buf_index ) __field( u16, personality ) __field( u32, file_index ) __field( u64, pad1 ) __field( u64, addr3 ) __field( int, error ) __string( op_str, io_uring_get_opcode(sqe->opcode) ) ), TP_fast_assign( __entry->ctx = req->ctx; __entry->req = req; __entry->user_data = sqe->user_data; __entry->opcode = sqe->opcode; __entry->flags = sqe->flags; __entry->ioprio = sqe->ioprio; __entry->off = sqe->off; __entry->addr = sqe->addr; __entry->len = sqe->len; __entry->op_flags = sqe->poll32_events; __entry->buf_index = sqe->buf_index; __entry->personality = sqe->personality; __entry->file_index = sqe->file_index; __entry->pad1 = sqe->__pad2[0]; __entry->addr3 = sqe->addr3; __entry->error = error; __assign_str(op_str); ), TP_printk("ring %p, req %p, user_data 0x%llx, " "opcode %s, flags 0x%x, prio=%d, off=%llu, addr=%llu, " "len=%u, rw_flags=0x%x, buf_index=%d, " "personality=%d, file_index=%d, pad=0x%llx, addr3=%llx, " "error=%d", __entry->ctx, __entry->req, __entry->user_data, __get_str(op_str), __entry->flags, __entry->ioprio, (unsigned long long)__entry->off, (unsigned long long) __entry->addr, __entry->len, __entry->op_flags, __entry->buf_index, __entry->personality, __entry->file_index, (unsigned long long) __entry->pad1, (unsigned long long) __entry->addr3, __entry->error) ); /* * io_uring_cqe_overflow - a CQE overflowed * * @ctx: pointer to a ring context structure * @user_data: user data associated with the request * @res: CQE result * @cflags: CQE flags * @ocqe: pointer to the overflow cqe (if available) * */ TRACE_EVENT(io_uring_cqe_overflow, TP_PROTO(void *ctx, unsigned long long user_data, s32 res, u32 cflags, void *ocqe), TP_ARGS(ctx, user_data, res, cflags, ocqe), TP_STRUCT__entry ( __field( void *, ctx ) __field( unsigned long long, user_data ) __field( s32, res ) __field( u32, cflags ) __field( void *, ocqe ) ), TP_fast_assign( __entry->ctx = ctx; __entry->user_data = user_data; __entry->res = res; __entry->cflags = cflags; __entry->ocqe = ocqe; ), TP_printk("ring %p, user_data 0x%llx, res %d, cflags 0x%x, " "overflow_cqe %p", __entry->ctx, __entry->user_data, __entry->res, __entry->cflags, __entry->ocqe) ); /* * io_uring_task_work_run - ran task work * * @tctx: pointer to a io_uring_task * @count: how many functions it ran * */ TRACE_EVENT(io_uring_task_work_run, TP_PROTO(void *tctx, unsigned int count), TP_ARGS(tctx, count), TP_STRUCT__entry ( __field( void *, tctx ) __field( unsigned int, count ) ), TP_fast_assign( __entry->tctx = tctx; __entry->count = count; ), TP_printk("tctx %p, count %u", __entry->tctx, __entry->count) ); TRACE_EVENT(io_uring_short_write, TP_PROTO(void *ctx, u64 fpos, u64 wanted, u64 got), TP_ARGS(ctx, fpos, wanted, got), TP_STRUCT__entry( __field(void *, ctx) __field(u64, fpos) __field(u64, wanted) __field(u64, got) ), TP_fast_assign( __entry->ctx = ctx; __entry->fpos = fpos; __entry->wanted = wanted; __entry->got = got; ), TP_printk("ring %p, fpos %lld, wanted %lld, got %lld", __entry->ctx, __entry->fpos, __entry->wanted, __entry->got) ); /* * io_uring_local_work_run - ran ring local task work * * @tctx: pointer to a io_uring_ctx * @count: how many functions it ran * @loops: how many loops it ran * */ TRACE_EVENT(io_uring_local_work_run, TP_PROTO(void *ctx, int count, unsigned int loops), TP_ARGS(ctx, count, loops), TP_STRUCT__entry ( __field(void *, ctx ) __field(int, count ) __field(unsigned int, loops ) ), TP_fast_assign( __entry->ctx = ctx; __entry->count = count; __entry->loops = loops; ), TP_printk("ring %p, count %d, loops %u", __entry->ctx, __entry->count, __entry->loops) ); #endif /* _TRACE_IO_URING_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
2445 2459 2460 2462 2442 2459 2445 2462 2462 2430 2439 2461 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 // SPDX-License-Identifier: GPL-2.0 #include <linux/mm.h> #include <linux/mmzone.h> #include <linux/memblock.h> #include <linux/page_ext.h> #include <linux/memory.h> #include <linux/vmalloc.h> #include <linux/kmemleak.h> #include <linux/page_owner.h> #include <linux/page_idle.h> #include <linux/page_table_check.h> #include <linux/rcupdate.h> #include <linux/pgalloc_tag.h> /* * struct page extension * * This is the feature to manage memory for extended data per page. * * Until now, we must modify struct page itself to store extra data per page. * This requires rebuilding the kernel and it is really time consuming process. * And, sometimes, rebuild is impossible due to third party module dependency. * At last, enlarging struct page could cause un-wanted system behaviour change. * * This feature is intended to overcome above mentioned problems. This feature * allocates memory for extended data per page in certain place rather than * the struct page itself. This memory can be accessed by the accessor * functions provided by this code. During the boot process, it checks whether * allocation of huge chunk of memory is needed or not. If not, it avoids * allocating memory at all. With this advantage, we can include this feature * into the kernel in default and can avoid rebuild and solve related problems. * * To help these things to work well, there are two callbacks for clients. One * is the need callback which is mandatory if user wants to avoid useless * memory allocation at boot-time. The other is optional, init callback, which * is used to do proper initialization after memory is allocated. * * The need callback is used to decide whether extended memory allocation is * needed or not. Sometimes users want to deactivate some features in this * boot and extra memory would be unnecessary. In this case, to avoid * allocating huge chunk of memory, each clients represent their need of * extra memory through the need callback. If one of the need callbacks * returns true, it means that someone needs extra memory so that * page extension core should allocates memory for page extension. If * none of need callbacks return true, memory isn't needed at all in this boot * and page extension core can skip to allocate memory. As result, * none of memory is wasted. * * When need callback returns true, page_ext checks if there is a request for * extra memory through size in struct page_ext_operations. If it is non-zero, * extra space is allocated for each page_ext entry and offset is returned to * user through offset in struct page_ext_operations. * * The init callback is used to do proper initialization after page extension * is completely initialized. In sparse memory system, extra memory is * allocated some time later than memmap is allocated. In other words, lifetime * of memory for page extension isn't same with memmap for struct page. * Therefore, clients can't store extra data until page extension is * initialized, even if pages are allocated and used freely. This could * cause inadequate state of extra data per page, so, to prevent it, client * can utilize this callback to initialize the state of it correctly. */ #ifdef CONFIG_SPARSEMEM #define PAGE_EXT_INVALID (0x1) #endif #if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) static bool need_page_idle(void) { return true; } static struct page_ext_operations page_idle_ops __initdata = { .need = need_page_idle, .need_shared_flags = true, }; #endif static struct page_ext_operations *page_ext_ops[] __initdata = { #ifdef CONFIG_PAGE_OWNER &page_owner_ops, #endif #if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) &page_idle_ops, #endif #ifdef CONFIG_MEM_ALLOC_PROFILING &page_alloc_tagging_ops, #endif #ifdef CONFIG_PAGE_TABLE_CHECK &page_table_check_ops, #endif }; unsigned long page_ext_size; static unsigned long total_usage; #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG /* * To ensure correct allocation tagging for pages, page_ext should be available * before the first page allocation. Otherwise early task stacks will be * allocated before page_ext initialization and missing tags will be flagged. */ bool early_page_ext __meminitdata = true; #else bool early_page_ext __meminitdata; #endif static int __init setup_early_page_ext(char *str) { early_page_ext = true; return 0; } early_param("early_page_ext", setup_early_page_ext); static bool __init invoke_need_callbacks(void) { int i; int entries = ARRAY_SIZE(page_ext_ops); bool need = false; for (i = 0; i < entries; i++) { if (page_ext_ops[i]->need()) { if (page_ext_ops[i]->need_shared_flags) { page_ext_size = sizeof(struct page_ext); break; } } } for (i = 0; i < entries; i++) { if (page_ext_ops[i]->need()) { page_ext_ops[i]->offset = page_ext_size; page_ext_size += page_ext_ops[i]->size; need = true; } } return need; } static void __init invoke_init_callbacks(void) { int i; int entries = ARRAY_SIZE(page_ext_ops); for (i = 0; i < entries; i++) { if (page_ext_ops[i]->init) page_ext_ops[i]->init(); } } static inline struct page_ext *get_entry(void *base, unsigned long index) { return base + page_ext_size * index; } #ifndef CONFIG_SPARSEMEM void __init page_ext_init_flatmem_late(void) { invoke_init_callbacks(); } void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) { pgdat->node_page_ext = NULL; } static struct page_ext *lookup_page_ext(const struct page *page) { unsigned long pfn = page_to_pfn(page); unsigned long index; struct page_ext *base; WARN_ON_ONCE(!rcu_read_lock_held()); base = NODE_DATA(page_to_nid(page))->node_page_ext; /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. */ if (unlikely(!base)) return NULL; index = pfn - round_down(node_start_pfn(page_to_nid(page)), MAX_ORDER_NR_PAGES); return get_entry(base, index); } static int __init alloc_node_page_ext(int nid) { struct page_ext *base; unsigned long table_size; unsigned long nr_pages; nr_pages = NODE_DATA(nid)->node_spanned_pages; if (!nr_pages) return 0; /* * Need extra space if node range is not aligned with * MAX_ORDER_NR_PAGES. When page allocator's buddy algorithm * checks buddy's status, range could be out of exact node range. */ if (!IS_ALIGNED(node_start_pfn(nid), MAX_ORDER_NR_PAGES) || !IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES)) nr_pages += MAX_ORDER_NR_PAGES; table_size = page_ext_size * nr_pages; base = memblock_alloc_try_nid( table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), MEMBLOCK_ALLOC_ACCESSIBLE, nid); if (!base) return -ENOMEM; NODE_DATA(nid)->node_page_ext = base; total_usage += table_size; return 0; } void __init page_ext_init_flatmem(void) { int nid, fail; if (!invoke_need_callbacks()) return; for_each_online_node(nid) { fail = alloc_node_page_ext(nid); if (fail) goto fail; } pr_info("allocated %ld bytes of page_ext\n", total_usage); return; fail: pr_crit("allocation of page_ext failed.\n"); panic("Out of memory"); } #else /* CONFIG_SPARSEMEM */ static bool page_ext_invalid(struct page_ext *page_ext) { return !page_ext || (((unsigned long)page_ext & PAGE_EXT_INVALID) == PAGE_EXT_INVALID); } static struct page_ext *lookup_page_ext(const struct page *page) { unsigned long pfn = page_to_pfn(page); struct mem_section *section = __pfn_to_section(pfn); struct page_ext *page_ext = READ_ONCE(section->page_ext); WARN_ON_ONCE(!rcu_read_lock_held()); /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. */ if (page_ext_invalid(page_ext)) return NULL; return get_entry(page_ext, pfn); } static void *__meminit alloc_page_ext(size_t size, int nid) { gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN; void *addr = NULL; addr = alloc_pages_exact_nid(nid, size, flags); if (addr) { kmemleak_alloc(addr, size, 1, flags); return addr; } addr = vzalloc_node(size, nid); return addr; } static int __meminit init_section_page_ext(unsigned long pfn, int nid) { struct mem_section *section; struct page_ext *base; unsigned long table_size; section = __pfn_to_section(pfn); if (section->page_ext) return 0; table_size = page_ext_size * PAGES_PER_SECTION; base = alloc_page_ext(table_size, nid); /* * The value stored in section->page_ext is (base - pfn) * and it does not point to the memory block allocated above, * causing kmemleak false positives. */ kmemleak_not_leak(base); if (!base) { pr_err("page ext allocation failure\n"); return -ENOMEM; } /* * The passed "pfn" may not be aligned to SECTION. For the calculation * we need to apply a mask. */ pfn &= PAGE_SECTION_MASK; section->page_ext = (void *)base - page_ext_size * pfn; total_usage += table_size; return 0; } static void free_page_ext(void *addr) { if (is_vmalloc_addr(addr)) { vfree(addr); } else { struct page *page = virt_to_page(addr); size_t table_size; table_size = page_ext_size * PAGES_PER_SECTION; BUG_ON(PageReserved(page)); kmemleak_free(addr); free_pages_exact(addr, table_size); } } static void __free_page_ext(unsigned long pfn) { struct mem_section *ms; struct page_ext *base; ms = __pfn_to_section(pfn); if (!ms || !ms->page_ext) return; base = READ_ONCE(ms->page_ext); /* * page_ext here can be valid while doing the roll back * operation in online_page_ext(). */ if (page_ext_invalid(base)) base = (void *)base - PAGE_EXT_INVALID; WRITE_ONCE(ms->page_ext, NULL); base = get_entry(base, pfn); free_page_ext(base); } static void __invalidate_page_ext(unsigned long pfn) { struct mem_section *ms; void *val; ms = __pfn_to_section(pfn); if (!ms || !ms->page_ext) return; val = (void *)ms->page_ext + PAGE_EXT_INVALID; WRITE_ONCE(ms->page_ext, val); } static int __meminit online_page_ext(unsigned long start_pfn, unsigned long nr_pages, int nid) { unsigned long start, end, pfn; int fail = 0; start = SECTION_ALIGN_DOWN(start_pfn); end = SECTION_ALIGN_UP(start_pfn + nr_pages); if (nid == NUMA_NO_NODE) { /* * In this case, "nid" already exists and contains valid memory. * "start_pfn" passed to us is a pfn which is an arg for * online__pages(), and start_pfn should exist. */ nid = pfn_to_nid(start_pfn); VM_BUG_ON(!node_online(nid)); } for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) fail = init_section_page_ext(pfn, nid); if (!fail) return 0; /* rollback */ end = pfn - PAGES_PER_SECTION; for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) __free_page_ext(pfn); return -ENOMEM; } static void __meminit offline_page_ext(unsigned long start_pfn, unsigned long nr_pages) { unsigned long start, end, pfn; start = SECTION_ALIGN_DOWN(start_pfn); end = SECTION_ALIGN_UP(start_pfn + nr_pages); /* * Freeing of page_ext is done in 3 steps to avoid * use-after-free of it: * 1) Traverse all the sections and mark their page_ext * as invalid. * 2) Wait for all the existing users of page_ext who * started before invalidation to finish. * 3) Free the page_ext. */ for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) __invalidate_page_ext(pfn); synchronize_rcu(); for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) __free_page_ext(pfn); } static int __meminit page_ext_callback(struct notifier_block *self, unsigned long action, void *arg) { struct memory_notify *mn = arg; int ret = 0; switch (action) { case MEM_GOING_ONLINE: ret = online_page_ext(mn->start_pfn, mn->nr_pages, mn->status_change_nid); break; case MEM_OFFLINE: offline_page_ext(mn->start_pfn, mn->nr_pages); break; case MEM_CANCEL_ONLINE: offline_page_ext(mn->start_pfn, mn->nr_pages); break; case MEM_GOING_OFFLINE: break; case MEM_ONLINE: case MEM_CANCEL_OFFLINE: break; } return notifier_from_errno(ret); } void __init page_ext_init(void) { unsigned long pfn; int nid; if (!invoke_need_callbacks()) return; for_each_node_state(nid, N_MEMORY) { unsigned long start_pfn, end_pfn; start_pfn = node_start_pfn(nid); end_pfn = node_end_pfn(nid); /* * start_pfn and end_pfn may not be aligned to SECTION and the * page->flags of out of node pages are not initialized. So we * scan [start_pfn, the biggest section's pfn < end_pfn) here. */ for (pfn = start_pfn; pfn < end_pfn; pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) { if (!pfn_valid(pfn)) continue; /* * Nodes's pfns can be overlapping. * We know some arch can have a nodes layout such as * -------------pfn--------------> * N0 | N1 | N2 | N0 | N1 | N2|.... */ if (pfn_to_nid(pfn) != nid) continue; if (init_section_page_ext(pfn, nid)) goto oom; cond_resched(); } } hotplug_memory_notifier(page_ext_callback, DEFAULT_CALLBACK_PRI); pr_info("allocated %ld bytes of page_ext\n", total_usage); invoke_init_callbacks(); return; oom: panic("Out of memory"); } void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) { } #endif /** * page_ext_get() - Get the extended information for a page. * @page: The page we're interested in. * * Ensures that the page_ext will remain valid until page_ext_put() * is called. * * Return: NULL if no page_ext exists for this page. * Context: Any context. Caller may not sleep until they have called * page_ext_put(). */ struct page_ext *page_ext_get(const struct page *page) { struct page_ext *page_ext; rcu_read_lock(); page_ext = lookup_page_ext(page); if (!page_ext) { rcu_read_unlock(); return NULL; } return page_ext; } /** * page_ext_put() - Working with page extended information is done. * @page_ext: Page extended information received from page_ext_get(). * * The page extended information of the page may not be valid after this * function is called. * * Return: None. * Context: Any context with corresponding page_ext_get() is called. */ void page_ext_put(struct page_ext *page_ext) { if (unlikely(!page_ext)) return; rcu_read_unlock(); }
8 8 8 8 8 8 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 /* * Copyright (c) 2006-2009 Red Hat Inc. * Copyright (c) 2006-2008 Intel Corporation * Copyright (c) 2007 Dave Airlie <airlied@linux.ie> * * DRM framebuffer helper functions * * Permission to use, copy, modify, distribute, and sell this software and its * documentation for any purpose is hereby granted without fee, provided that * the above copyright notice appear in all copies and that both that copyright * notice and this permission notice appear in supporting documentation, and * that the name of the copyright holders not be used in advertising or * publicity pertaining to distribution of the software without specific, * written prior permission. The copyright holders make no representations * about the suitability of this software for any purpose. It is provided "as * is" without express or implied warranty. * * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE * OF THIS SOFTWARE. * * Authors: * Dave Airlie <airlied@linux.ie> * Jesse Barnes <jesse.barnes@intel.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/console.h> #include <linux/pci.h> #include <linux/sysrq.h> #include <linux/vga_switcheroo.h> #include <drm/drm_atomic.h> #include <drm/drm_drv.h> #include <drm/drm_fb_helper.h> #include <drm/drm_fourcc.h> #include <drm/drm_framebuffer.h> #include <drm/drm_modeset_helper_vtables.h> #include <drm/drm_print.h> #include <drm/drm_vblank.h> #include "drm_internal.h" static bool drm_fbdev_emulation = true; module_param_named(fbdev_emulation, drm_fbdev_emulation, bool, 0600); MODULE_PARM_DESC(fbdev_emulation, "Enable legacy fbdev emulation [default=true]"); static int drm_fbdev_overalloc = CONFIG_DRM_FBDEV_OVERALLOC; module_param(drm_fbdev_overalloc, int, 0444); MODULE_PARM_DESC(drm_fbdev_overalloc, "Overallocation of the fbdev buffer (%) [default=" __MODULE_STRING(CONFIG_DRM_FBDEV_OVERALLOC) "]"); /* * In order to keep user-space compatibility, we want in certain use-cases * to keep leaking the fbdev physical address to the user-space program * handling the fbdev buffer. * * This is a bad habit, essentially kept to support closed-source OpenGL * drivers that should really be moved into open-source upstream projects * instead of using legacy physical addresses in user space to communicate * with other out-of-tree kernel modules. * * This module_param *should* be removed as soon as possible and be * considered as a broken and legacy behaviour from a modern fbdev device. */ static bool drm_leak_fbdev_smem; #if IS_ENABLED(CONFIG_DRM_FBDEV_LEAK_PHYS_SMEM) module_param_unsafe(drm_leak_fbdev_smem, bool, 0600); MODULE_PARM_DESC(drm_leak_fbdev_smem, "Allow unsafe leaking fbdev physical smem address [default=false]"); #endif static LIST_HEAD(kernel_fb_helper_list); static DEFINE_MUTEX(kernel_fb_helper_lock); /** * DOC: fbdev helpers * * The fb helper functions are useful to provide an fbdev on top of a drm kernel * mode setting driver. They can be used mostly independently from the crtc * helper functions used by many drivers to implement the kernel mode setting * interfaces. * * Drivers that support a dumb buffer with a virtual address and mmap support, * should try out the generic fbdev emulation using drm_fbdev_generic_setup(). * It will automatically set up deferred I/O if the driver requires a shadow * buffer. * * Existing fbdev implementations should restore the fbdev console by using * drm_fb_helper_lastclose() as their &drm_driver.lastclose callback. * They should also notify the fb helper code from updates to the output * configuration by using drm_fb_helper_output_poll_changed() as their * &drm_mode_config_funcs.output_poll_changed callback. New implementations * of fbdev should be build on top of struct &drm_client_funcs, which handles * this automatically. Setting the old callbacks should be avoided. * * For suspend/resume consider using drm_mode_config_helper_suspend() and * drm_mode_config_helper_resume() which takes care of fbdev as well. * * All other functions exported by the fb helper library can be used to * implement the fbdev driver interface by the driver. * * It is possible, though perhaps somewhat tricky, to implement race-free * hotplug detection using the fbdev helpers. The drm_fb_helper_prepare() * helper must be called first to initialize the minimum required to make * hotplug detection work. Drivers also need to make sure to properly set up * the &drm_mode_config.funcs member. After calling drm_kms_helper_poll_init() * it is safe to enable interrupts and start processing hotplug events. At the * same time, drivers should initialize all modeset objects such as CRTCs, * encoders and connectors. To finish up the fbdev helper initialization, the * drm_fb_helper_init() function is called. To probe for all attached displays * and set up an initial configuration using the detected hardware, drivers * should call drm_fb_helper_initial_config(). * * If &drm_framebuffer_funcs.dirty is set, the * drm_fb_helper_{cfb,sys}_{write,fillrect,copyarea,imageblit} functions will * accumulate changes and schedule &drm_fb_helper.dirty_work to run right * away. This worker then calls the dirty() function ensuring that it will * always run in process context since the fb_*() function could be running in * atomic context. If drm_fb_helper_deferred_io() is used as the deferred_io * callback it will also schedule dirty_work with the damage collected from the * mmap page writes. * * Deferred I/O is not compatible with SHMEM. Such drivers should request an * fbdev shadow buffer and call drm_fbdev_generic_setup() instead. */ static void drm_fb_helper_restore_lut_atomic(struct drm_crtc *crtc) { uint16_t *r_base, *g_base, *b_base; if (crtc->funcs->gamma_set == NULL) return; r_base = crtc->gamma_store; g_base = r_base + crtc->gamma_size; b_base = g_base + crtc->gamma_size; crtc->funcs->gamma_set(crtc, r_base, g_base, b_base, crtc->gamma_size, NULL); } /** * drm_fb_helper_debug_enter - implementation for &fb_ops.fb_debug_enter * @info: fbdev registered by the helper */ int drm_fb_helper_debug_enter(struct fb_info *info) { struct drm_fb_helper *helper = info->par; const struct drm_crtc_helper_funcs *funcs; struct drm_mode_set *mode_set; list_for_each_entry(helper, &kernel_fb_helper_list, kernel_fb_list) { mutex_lock(&helper->client.modeset_mutex); drm_client_for_each_modeset(mode_set, &helper->client) { if (!mode_set->crtc->enabled) continue; funcs = mode_set->crtc->helper_private; if (funcs->mode_set_base_atomic == NULL) continue; if (drm_drv_uses_atomic_modeset(mode_set->crtc->dev)) continue; funcs->mode_set_base_atomic(mode_set->crtc, mode_set->fb, mode_set->x, mode_set->y, ENTER_ATOMIC_MODE_SET); } mutex_unlock(&helper->client.modeset_mutex); } return 0; } EXPORT_SYMBOL(drm_fb_helper_debug_enter); /** * drm_fb_helper_debug_leave - implementation for &fb_ops.fb_debug_leave * @info: fbdev registered by the helper */ int drm_fb_helper_debug_leave(struct fb_info *info) { struct drm_fb_helper *helper = info->par; struct drm_client_dev *client = &helper->client; struct drm_device *dev = helper->dev; struct drm_crtc *crtc; const struct drm_crtc_helper_funcs *funcs; struct drm_mode_set *mode_set; struct drm_framebuffer *fb; mutex_lock(&client->modeset_mutex); drm_client_for_each_modeset(mode_set, client) { crtc = mode_set->crtc; if (drm_drv_uses_atomic_modeset(crtc->dev)) continue; funcs = crtc->helper_private; fb = crtc->primary->fb; if (!crtc->enabled) continue; if (!fb) { drm_err(dev, "no fb to restore?\n"); continue; } if (funcs->mode_set_base_atomic == NULL) continue; drm_fb_helper_restore_lut_atomic(mode_set->crtc); funcs->mode_set_base_atomic(mode_set->crtc, fb, crtc->x, crtc->y, LEAVE_ATOMIC_MODE_SET); } mutex_unlock(&client->modeset_mutex); return 0; } EXPORT_SYMBOL(drm_fb_helper_debug_leave); static int __drm_fb_helper_restore_fbdev_mode_unlocked(struct drm_fb_helper *fb_helper, bool force) { bool do_delayed; int ret; if (!drm_fbdev_emulation || !fb_helper) return -ENODEV; if (READ_ONCE(fb_helper->deferred_setup)) return 0; mutex_lock(&fb_helper->lock); if (force) { /* * Yes this is the _locked version which expects the master lock * to be held. But for forced restores we're intentionally * racing here, see drm_fb_helper_set_par(). */ ret = drm_client_modeset_commit_locked(&fb_helper->client); } else { ret = drm_client_modeset_commit(&fb_helper->client); } do_delayed = fb_helper->delayed_hotplug; if (do_delayed) fb_helper->delayed_hotplug = false; mutex_unlock(&fb_helper->lock); if (do_delayed) drm_fb_helper_hotplug_event(fb_helper); return ret; } /** * drm_fb_helper_restore_fbdev_mode_unlocked - restore fbdev configuration * @fb_helper: driver-allocated fbdev helper, can be NULL * * This should be called from driver's drm &drm_driver.lastclose callback * when implementing an fbcon on top of kms using this helper. This ensures that * the user isn't greeted with a black screen when e.g. X dies. * * RETURNS: * Zero if everything went ok, negative error code otherwise. */ int drm_fb_helper_restore_fbdev_mode_unlocked(struct drm_fb_helper *fb_helper) { return __drm_fb_helper_restore_fbdev_mode_unlocked(fb_helper, false); } EXPORT_SYMBOL(drm_fb_helper_restore_fbdev_mode_unlocked); #ifdef CONFIG_MAGIC_SYSRQ /* emergency restore, don't bother with error reporting */ static void drm_fb_helper_restore_work_fn(struct work_struct *ignored) { struct drm_fb_helper *helper; mutex_lock(&kernel_fb_helper_lock); list_for_each_entry(helper, &kernel_fb_helper_list, kernel_fb_list) { struct drm_device *dev = helper->dev; if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) continue; mutex_lock(&helper->lock); drm_client_modeset_commit_locked(&helper->client); mutex_unlock(&helper->lock); } mutex_unlock(&kernel_fb_helper_lock); } static DECLARE_WORK(drm_fb_helper_restore_work, drm_fb_helper_restore_work_fn); static void drm_fb_helper_sysrq(u8 dummy1) { schedule_work(&drm_fb_helper_restore_work); } static const struct sysrq_key_op sysrq_drm_fb_helper_restore_op = { .handler = drm_fb_helper_sysrq, .help_msg = "force-fb(v)", .action_msg = "Restore framebuffer console", }; #else static const struct sysrq_key_op sysrq_drm_fb_helper_restore_op = { }; #endif static void drm_fb_helper_dpms(struct fb_info *info, int dpms_mode) { struct drm_fb_helper *fb_helper = info->par; mutex_lock(&fb_helper->lock); drm_client_modeset_dpms(&fb_helper->client, dpms_mode); mutex_unlock(&fb_helper->lock); } /** * drm_fb_helper_blank - implementation for &fb_ops.fb_blank * @blank: desired blanking state * @info: fbdev registered by the helper */ int drm_fb_helper_blank(int blank, struct fb_info *info) { if (oops_in_progress) return -EBUSY; switch (blank) { /* Display: On; HSync: On, VSync: On */ case FB_BLANK_UNBLANK: drm_fb_helper_dpms(info, DRM_MODE_DPMS_ON); break; /* Display: Off; HSync: On, VSync: On */ case FB_BLANK_NORMAL: drm_fb_helper_dpms(info, DRM_MODE_DPMS_STANDBY); break; /* Display: Off; HSync: Off, VSync: On */ case FB_BLANK_HSYNC_SUSPEND: drm_fb_helper_dpms(info, DRM_MODE_DPMS_STANDBY); break; /* Display: Off; HSync: On, VSync: Off */ case FB_BLANK_VSYNC_SUSPEND: drm_fb_helper_dpms(info, DRM_MODE_DPMS_SUSPEND); break; /* Display: Off; HSync: Off, VSync: Off */ case FB_BLANK_POWERDOWN: drm_fb_helper_dpms(info, DRM_MODE_DPMS_OFF); break; } return 0; } EXPORT_SYMBOL(drm_fb_helper_blank); static void drm_fb_helper_resume_worker(struct work_struct *work) { struct drm_fb_helper *helper = container_of(work, struct drm_fb_helper, resume_work); console_lock(); fb_set_suspend(helper->info, 0); console_unlock(); } static void drm_fb_helper_fb_dirty(struct drm_fb_helper *helper) { struct drm_device *dev = helper->dev; struct drm_clip_rect *clip = &helper->damage_clip; struct drm_clip_rect clip_copy; unsigned long flags; int ret; if (drm_WARN_ON_ONCE(dev, !helper->funcs->fb_dirty)) return; spin_lock_irqsave(&helper->damage_lock, flags); clip_copy = *clip; clip->x1 = clip->y1 = ~0; clip->x2 = clip->y2 = 0; spin_unlock_irqrestore(&helper->damage_lock, flags); ret = helper->funcs->fb_dirty(helper, &clip_copy); if (ret) goto err; return; err: /* * Restore damage clip rectangle on errors. The next run * of the damage worker will perform the update. */ spin_lock_irqsave(&helper->damage_lock, flags); clip->x1 = min_t(u32, clip->x1, clip_copy.x1); clip->y1 = min_t(u32, clip->y1, clip_copy.y1); clip->x2 = max_t(u32, clip->x2, clip_copy.x2); clip->y2 = max_t(u32, clip->y2, clip_copy.y2); spin_unlock_irqrestore(&helper->damage_lock, flags); } static void drm_fb_helper_damage_work(struct work_struct *work) { struct drm_fb_helper *helper = container_of(work, struct drm_fb_helper, damage_work); drm_fb_helper_fb_dirty(helper); } /** * drm_fb_helper_prepare - setup a drm_fb_helper structure * @dev: DRM device * @helper: driver-allocated fbdev helper structure to set up * @preferred_bpp: Preferred bits per pixel for the device. * @funcs: pointer to structure of functions associate with this helper * * Sets up the bare minimum to make the framebuffer helper usable. This is * useful to implement race-free initialization of the polling helpers. */ void drm_fb_helper_prepare(struct drm_device *dev, struct drm_fb_helper *helper, unsigned int preferred_bpp, const struct drm_fb_helper_funcs *funcs) { /* * Pick a preferred bpp of 32 if no value has been given. This * will select XRGB8888 for the framebuffer formats. All drivers * have to support XRGB8888 for backwards compatibility with legacy * userspace, so it's the safe choice here. * * TODO: Replace struct drm_mode_config.preferred_depth and this * bpp value with a preferred format that is given as struct * drm_format_info. Then derive all other values from the * format. */ if (!preferred_bpp) preferred_bpp = 32; INIT_LIST_HEAD(&helper->kernel_fb_list); spin_lock_init(&helper->damage_lock); INIT_WORK(&helper->resume_work, drm_fb_helper_resume_worker); INIT_WORK(&helper->damage_work, drm_fb_helper_damage_work); helper->damage_clip.x1 = helper->damage_clip.y1 = ~0; mutex_init(&helper->lock); helper->funcs = funcs; helper->dev = dev; helper->preferred_bpp = preferred_bpp; } EXPORT_SYMBOL(drm_fb_helper_prepare); /** * drm_fb_helper_unprepare - clean up a drm_fb_helper structure * @fb_helper: driver-allocated fbdev helper structure to set up * * Cleans up the framebuffer helper. Inverse of drm_fb_helper_prepare(). */ void drm_fb_helper_unprepare(struct drm_fb_helper *fb_helper) { mutex_destroy(&fb_helper->lock); } EXPORT_SYMBOL(drm_fb_helper_unprepare); /** * drm_fb_helper_init - initialize a &struct drm_fb_helper * @dev: drm device * @fb_helper: driver-allocated fbdev helper structure to initialize * * This allocates the structures for the fbdev helper with the given limits. * Note that this won't yet touch the hardware (through the driver interfaces) * nor register the fbdev. This is only done in drm_fb_helper_initial_config() * to allow driver writes more control over the exact init sequence. * * Drivers must call drm_fb_helper_prepare() before calling this function. * * RETURNS: * Zero if everything went ok, nonzero otherwise. */ int drm_fb_helper_init(struct drm_device *dev, struct drm_fb_helper *fb_helper) { int ret; /* * If this is not the generic fbdev client, initialize a drm_client * without callbacks so we can use the modesets. */ if (!fb_helper->client.funcs) { ret = drm_client_init(dev, &fb_helper->client, "drm_fb_helper", NULL); if (ret) return ret; } dev->fb_helper = fb_helper; return 0; } EXPORT_SYMBOL(drm_fb_helper_init); /** * drm_fb_helper_alloc_info - allocate fb_info and some of its members * @fb_helper: driver-allocated fbdev helper * * A helper to alloc fb_info and the member cmap. Called by the driver * within the fb_probe fb_helper callback function. Drivers do not * need to release the allocated fb_info structure themselves, this is * automatically done when calling drm_fb_helper_fini(). * * RETURNS: * fb_info pointer if things went okay, pointer containing error code * otherwise */ struct fb_info *drm_fb_helper_alloc_info(struct drm_fb_helper *fb_helper) { struct device *dev = fb_helper->dev->dev; struct fb_info *info; int ret; info = framebuffer_alloc(0, dev); if (!info) return ERR_PTR(-ENOMEM); if (!drm_leak_fbdev_smem) info->flags |= FBINFO_HIDE_SMEM_START; ret = fb_alloc_cmap(&info->cmap, 256, 0); if (ret) goto err_release; fb_helper->info = info; info->skip_vt_switch = true; return info; err_release: framebuffer_release(info); return ERR_PTR(ret); } EXPORT_SYMBOL(drm_fb_helper_alloc_info); /** * drm_fb_helper_release_info - release fb_info and its members * @fb_helper: driver-allocated fbdev helper * * A helper to release fb_info and the member cmap. Drivers do not * need to release the allocated fb_info structure themselves, this is * automatically done when calling drm_fb_helper_fini(). */ void drm_fb_helper_release_info(struct drm_fb_helper *fb_helper) { struct fb_info *info = fb_helper->info; if (!info) return; fb_helper->info = NULL; if (info->cmap.len) fb_dealloc_cmap(&info->cmap); framebuffer_release(info); } EXPORT_SYMBOL(drm_fb_helper_release_info); /** * drm_fb_helper_unregister_info - unregister fb_info framebuffer device * @fb_helper: driver-allocated fbdev helper, can be NULL * * A wrapper around unregister_framebuffer, to release the fb_info * framebuffer device. This must be called before releasing all resources for * @fb_helper by calling drm_fb_helper_fini(). */ void drm_fb_helper_unregister_info(struct drm_fb_helper *fb_helper) { if (fb_helper && fb_helper->info) unregister_framebuffer(fb_helper->info); } EXPORT_SYMBOL(drm_fb_helper_unregister_info); /** * drm_fb_helper_fini - finialize a &struct drm_fb_helper * @fb_helper: driver-allocated fbdev helper, can be NULL * * This cleans up all remaining resources associated with @fb_helper. */ void drm_fb_helper_fini(struct drm_fb_helper *fb_helper) { if (!fb_helper) return; fb_helper->dev->fb_helper = NULL; if (!drm_fbdev_emulation) return; cancel_work_sync(&fb_helper->resume_work); cancel_work_sync(&fb_helper->damage_work); drm_fb_helper_release_info(fb_helper); mutex_lock(&kernel_fb_helper_lock); if (!list_empty(&fb_helper->kernel_fb_list)) { list_del(&fb_helper->kernel_fb_list); if (list_empty(&kernel_fb_helper_list)) unregister_sysrq_key('v', &sysrq_drm_fb_helper_restore_op); } mutex_unlock(&kernel_fb_helper_lock); if (!fb_helper->client.funcs) drm_client_release(&fb_helper->client); } EXPORT_SYMBOL(drm_fb_helper_fini); static void drm_fb_helper_add_damage_clip(struct drm_fb_helper *helper, u32 x, u32 y, u32 width, u32 height) { struct drm_clip_rect *clip = &helper->damage_clip; unsigned long flags; spin_lock_irqsave(&helper->damage_lock, flags); clip->x1 = min_t(u32, clip->x1, x); clip->y1 = min_t(u32, clip->y1, y); clip->x2 = max_t(u32, clip->x2, x + width); clip->y2 = max_t(u32, clip->y2, y + height); spin_unlock_irqrestore(&helper->damage_lock, flags); } static void drm_fb_helper_damage(struct drm_fb_helper *helper, u32 x, u32 y, u32 width, u32 height) { drm_fb_helper_add_damage_clip(helper, x, y, width, height); schedule_work(&helper->damage_work); } /* * Convert memory region into area of scanlines and pixels per * scanline. The parameters off and len must not reach beyond * the end of the framebuffer. */ static void drm_fb_helper_memory_range_to_clip(struct fb_info *info, off_t off, size_t len, struct drm_rect *clip) { u32 line_length = info->fix.line_length; u32 fb_height = info->var.yres; off_t end = off + len; u32 x1 = 0; u32 y1 = off / line_length; u32 x2 = info->var.xres; u32 y2 = DIV_ROUND_UP(end, line_length); /* Don't allow any of them beyond the bottom bound of display area */ if (y1 > fb_height) y1 = fb_height; if (y2 > fb_height) y2 = fb_height; if ((y2 - y1) == 1) { /* * We've only written to a single scanline. Try to reduce * the number of horizontal pixels that need an update. */ off_t bit_off = (off % line_length) * 8; off_t bit_end = (end % line_length) * 8; x1 = bit_off / info->var.bits_per_pixel; x2 = DIV_ROUND_UP(bit_end, info->var.bits_per_pixel); } drm_rect_init(clip, x1, y1, x2 - x1, y2 - y1); } /* Don't use in new code. */ void drm_fb_helper_damage_range(struct fb_info *info, off_t off, size_t len) { struct drm_fb_helper *fb_helper = info->par; struct drm_rect damage_area; drm_fb_helper_memory_range_to_clip(info, off, len, &damage_area); drm_fb_helper_damage(fb_helper, damage_area.x1, damage_area.y1, drm_rect_width(&damage_area), drm_rect_height(&damage_area)); } EXPORT_SYMBOL(drm_fb_helper_damage_range); /* Don't use in new code. */ void drm_fb_helper_damage_area(struct fb_info *info, u32 x, u32 y, u32 width, u32 height) { struct drm_fb_helper *fb_helper = info->par; drm_fb_helper_damage(fb_helper, x, y, width, height); } EXPORT_SYMBOL(drm_fb_helper_damage_area); /** * drm_fb_helper_deferred_io() - fbdev deferred_io callback function * @info: fb_info struct pointer * @pagereflist: list of mmap framebuffer pages that have to be flushed * * This function is used as the &fb_deferred_io.deferred_io * callback function for flushing the fbdev mmap writes. */ void drm_fb_helper_deferred_io(struct fb_info *info, struct list_head *pagereflist) { struct drm_fb_helper *helper = info->par; unsigned long start, end, min_off, max_off, total_size; struct fb_deferred_io_pageref *pageref; struct drm_rect damage_area; min_off = ULONG_MAX; max_off = 0; list_for_each_entry(pageref, pagereflist, list) { start = pageref->offset; end = start + PAGE_SIZE; min_off = min(min_off, start); max_off = max(max_off, end); } /* * As we can only track pages, we might reach beyond the end * of the screen and account for non-existing scanlines. Hence, * keep the covered memory area within the screen buffer. */ if (info->screen_size) total_size = info->screen_size; else total_size = info->fix.smem_len; max_off = min(max_off, total_size); if (min_off < max_off) { drm_fb_helper_memory_range_to_clip(info, min_off, max_off - min_off, &damage_area); drm_fb_helper_damage(helper, damage_area.x1, damage_area.y1, drm_rect_width(&damage_area), drm_rect_height(&damage_area)); } } EXPORT_SYMBOL(drm_fb_helper_deferred_io); /** * drm_fb_helper_set_suspend - wrapper around fb_set_suspend * @fb_helper: driver-allocated fbdev helper, can be NULL * @suspend: whether to suspend or resume * * A wrapper around fb_set_suspend implemented by fbdev core. * Use drm_fb_helper_set_suspend_unlocked() if you don't need to take * the lock yourself */ void drm_fb_helper_set_suspend(struct drm_fb_helper *fb_helper, bool suspend) { if (fb_helper && fb_helper->info) fb_set_suspend(fb_helper->info, suspend); } EXPORT_SYMBOL(drm_fb_helper_set_suspend); /** * drm_fb_helper_set_suspend_unlocked - wrapper around fb_set_suspend that also * takes the console lock * @fb_helper: driver-allocated fbdev helper, can be NULL * @suspend: whether to suspend or resume * * A wrapper around fb_set_suspend() that takes the console lock. If the lock * isn't available on resume, a worker is tasked with waiting for the lock * to become available. The console lock can be pretty contented on resume * due to all the printk activity. * * This function can be called multiple times with the same state since * &fb_info.state is checked to see if fbdev is running or not before locking. * * Use drm_fb_helper_set_suspend() if you need to take the lock yourself. */ void drm_fb_helper_set_suspend_unlocked(struct drm_fb_helper *fb_helper, bool suspend) { if (!fb_helper || !fb_helper->info) return; /* make sure there's no pending/ongoing resume */ flush_work(&fb_helper->resume_work); if (suspend) { if (fb_helper->info->state != FBINFO_STATE_RUNNING) return; console_lock(); } else { if (fb_helper->info->state == FBINFO_STATE_RUNNING) return; if (!console_trylock()) { schedule_work(&fb_helper->resume_work); return; } } fb_set_suspend(fb_helper->info, suspend); console_unlock(); } EXPORT_SYMBOL(drm_fb_helper_set_suspend_unlocked); static int setcmap_pseudo_palette(struct fb_cmap *cmap, struct fb_info *info) { u32 *palette = (u32 *)info->pseudo_palette; int i; if (cmap->start + cmap->len > 16) return -EINVAL; for (i = 0; i < cmap->len; ++i) { u16 red = cmap->red[i]; u16 green = cmap->green[i]; u16 blue = cmap->blue[i]; u32 value; red >>= 16 - info->var.red.length; green >>= 16 - info->var.green.length; blue >>= 16 - info->var.blue.length; value = (red << info->var.red.offset) | (green << info->var.green.offset) | (blue << info->var.blue.offset); if (info->var.transp.length > 0) { u32 mask = (1 << info->var.transp.length) - 1; mask <<= info->var.transp.offset; value |= mask; } palette[cmap->start + i] = value; } return 0; } static int setcmap_legacy(struct fb_cmap *cmap, struct fb_info *info) { struct drm_fb_helper *fb_helper = info->par; struct drm_mode_set *modeset; struct drm_crtc *crtc; u16 *r, *g, *b; int ret = 0; drm_modeset_lock_all(fb_helper->dev); drm_client_for_each_modeset(modeset, &fb_helper->client) { crtc = modeset->crtc; if (!crtc->funcs->gamma_set || !crtc->gamma_size) { ret = -EINVAL; goto out; } if (cmap->start + cmap->len > crtc->gamma_size) { ret = -EINVAL; goto out; } r = crtc->gamma_store; g = r + crtc->gamma_size; b = g + crtc->gamma_size; memcpy(r + cmap->start, cmap->red, cmap->len * sizeof(*r)); memcpy(g + cmap->start, cmap->green, cmap->len * sizeof(*g)); memcpy(b + cmap->start, cmap->blue, cmap->len * sizeof(*b)); ret = crtc->funcs->gamma_set(crtc, r, g, b, crtc->gamma_size, NULL); if (ret) goto out; } out: drm_modeset_unlock_all(fb_helper->dev); return ret; } static struct drm_property_blob *setcmap_new_gamma_lut(struct drm_crtc *crtc, struct fb_cmap *cmap) { struct drm_device *dev = crtc->dev; struct drm_property_blob *gamma_lut; struct drm_color_lut *lut; int size = crtc->gamma_size; int i; if (!size || cmap->start + cmap->len > size) return ERR_PTR(-EINVAL); gamma_lut = drm_property_create_blob(dev, sizeof(*lut) * size, NULL); if (IS_ERR(gamma_lut)) return gamma_lut; lut = gamma_lut->data; if (cmap->start || cmap->len != size) { u16 *r = crtc->gamma_store; u16 *g = r + crtc->gamma_size; u16 *b = g + crtc->gamma_size; for (i = 0; i < cmap->start; i++) { lut[i].red = r[i]; lut[i].green = g[i]; lut[i].blue = b[i]; } for (i = cmap->start + cmap->len; i < size; i++) { lut[i].red = r[i]; lut[i].green = g[i]; lut[i].blue = b[i]; } } for (i = 0; i < cmap->len; i++) { lut[cmap->start + i].red = cmap->red[i]; lut[cmap->start + i].green = cmap->green[i]; lut[cmap->start + i].blue = cmap->blue[i]; } return gamma_lut; } static int setcmap_atomic(struct fb_cmap *cmap, struct fb_info *info) { struct drm_fb_helper *fb_helper = info->par; struct drm_device *dev = fb_helper->dev; struct drm_property_blob *gamma_lut = NULL; struct drm_modeset_acquire_ctx ctx; struct drm_crtc_state *crtc_state; struct drm_atomic_state *state; struct drm_mode_set *modeset; struct drm_crtc *crtc; u16 *r, *g, *b; bool replaced; int ret = 0; drm_modeset_acquire_init(&ctx, 0); state = drm_atomic_state_alloc(dev); if (!state) { ret = -ENOMEM; goto out_ctx; } state->acquire_ctx = &ctx; retry: drm_client_for_each_modeset(modeset, &fb_helper->client) { crtc = modeset->crtc; if (!gamma_lut) gamma_lut = setcmap_new_gamma_lut(crtc, cmap); if (IS_ERR(gamma_lut)) { ret = PTR_ERR(gamma_lut); gamma_lut = NULL; goto out_state; } crtc_state = drm_atomic_get_crtc_state(state, crtc); if (IS_ERR(crtc_state)) { ret = PTR_ERR(crtc_state); goto out_state; } /* * FIXME: This always uses gamma_lut. Some HW have only * degamma_lut, in which case we should reset gamma_lut and set * degamma_lut. See drm_crtc_legacy_gamma_set(). */ replaced = drm_property_replace_blob(&crtc_state->degamma_lut, NULL); replaced |= drm_property_replace_blob(&crtc_state->ctm, NULL); replaced |= drm_property_replace_blob(&crtc_state->gamma_lut, gamma_lut); crtc_state->color_mgmt_changed |= replaced; } ret = drm_atomic_commit(state); if (ret) goto out_state; drm_client_for_each_modeset(modeset, &fb_helper->client) { crtc = modeset->crtc; r = crtc->gamma_store; g = r + crtc->gamma_size; b = g + crtc->gamma_size; memcpy(r + cmap->start, cmap->red, cmap->len * sizeof(*r)); memcpy(g + cmap->start, cmap->green, cmap->len * sizeof(*g)); memcpy(b + cmap->start, cmap->blue, cmap->len * sizeof(*b)); } out_state: if (ret == -EDEADLK) goto backoff; drm_property_blob_put(gamma_lut); drm_atomic_state_put(state); out_ctx: drm_modeset_drop_locks(&ctx); drm_modeset_acquire_fini(&ctx); return ret; backoff: drm_atomic_state_clear(state); drm_modeset_backoff(&ctx); goto retry; } /** * drm_fb_helper_setcmap - implementation for &fb_ops.fb_setcmap * @cmap: cmap to set * @info: fbdev registered by the helper */ int drm_fb_helper_setcmap(struct fb_cmap *cmap, struct fb_info *info) { struct drm_fb_helper *fb_helper = info->par; struct drm_device *dev = fb_helper->dev; int ret; if (oops_in_progress) return -EBUSY; mutex_lock(&fb_helper->lock); if (!drm_master_internal_acquire(dev)) { ret = -EBUSY; goto unlock; } mutex_lock(&fb_helper->client.modeset_mutex); if (info->fix.visual == FB_VISUAL_TRUECOLOR) ret = setcmap_pseudo_palette(cmap, info); else if (drm_drv_uses_atomic_modeset(fb_helper->dev)) ret = setcmap_atomic(cmap, info); else ret = setcmap_legacy(cmap, info); mutex_unlock(&fb_helper->client.modeset_mutex); drm_master_internal_release(dev); unlock: mutex_unlock(&fb_helper->lock); return ret; } EXPORT_SYMBOL(drm_fb_helper_setcmap); /** * drm_fb_helper_ioctl - legacy ioctl implementation * @info: fbdev registered by the helper * @cmd: ioctl command * @arg: ioctl argument * * A helper to implement the standard fbdev ioctl. Only * FBIO_WAITFORVSYNC is implemented for now. */ int drm_fb_helper_ioctl(struct fb_info *info, unsigned int cmd, unsigned long arg) { struct drm_fb_helper *fb_helper = info->par; struct drm_device *dev = fb_helper->dev; struct drm_crtc *crtc; int ret = 0; mutex_lock(&fb_helper->lock); if (!drm_master_internal_acquire(dev)) { ret = -EBUSY; goto unlock; } switch (cmd) { case FBIO_WAITFORVSYNC: /* * Only consider the first CRTC. * * This ioctl is supposed to take the CRTC number as * an argument, but in fbdev times, what that number * was supposed to be was quite unclear, different * drivers were passing that argument differently * (some by reference, some by value), and most of the * userspace applications were just hardcoding 0 as an * argument. * * The first CRTC should be the integrated panel on * most drivers, so this is the best choice we can * make. If we're not smart enough here, one should * just consider switch the userspace to KMS. */ crtc = fb_helper->client.modesets[0].crtc; /* * Only wait for a vblank event if the CRTC is * enabled, otherwise just don't do anythintg, * not even report an error. */ ret = drm_crtc_vblank_get(crtc); if (!ret) { drm_crtc_wait_one_vblank(crtc); drm_crtc_vblank_put(crtc); } ret = 0; break; default: ret = -ENOTTY; } drm_master_internal_release(dev); unlock: mutex_unlock(&fb_helper->lock); return ret; } EXPORT_SYMBOL(drm_fb_helper_ioctl); static bool drm_fb_pixel_format_equal(const struct fb_var_screeninfo *var_1, const struct fb_var_screeninfo *var_2) { return var_1->bits_per_pixel == var_2->bits_per_pixel && var_1->grayscale == var_2->grayscale && var_1->red.offset == var_2->red.offset && var_1->red.length == var_2->red.length && var_1->red.msb_right == var_2->red.msb_right && var_1->green.offset == var_2->green.offset && var_1->green.length == var_2->green.length && var_1->green.msb_right == var_2->green.msb_right && var_1->blue.offset == var_2->blue.offset && var_1->blue.length == var_2->blue.length && var_1->blue.msb_right == var_2->blue.msb_right && var_1->transp.offset == var_2->transp.offset && var_1->transp.length == var_2->transp.length && var_1->transp.msb_right == var_2->transp.msb_right; } static void drm_fb_helper_fill_pixel_fmt(struct fb_var_screeninfo *var, const struct drm_format_info *format) { u8 depth = format->depth; if (format->is_color_indexed) { var->red.offset = 0; var->green.offset = 0; var->blue.offset = 0; var->red.length = depth; var->green.length = depth; var->blue.length = depth; var->transp.offset = 0; var->transp.length = 0; return; } switch (depth) { case 15: var->red.offset = 10; var->green.offset = 5; var->blue.offset = 0; var->red.length = 5; var->green.length = 5; var->blue.length = 5; var->transp.offset = 15; var->transp.length = 1; break; case 16: var->red.offset = 11; var->green.offset = 5; var->blue.offset = 0; var->red.length = 5; var->green.length = 6; var->blue.length = 5; var->transp.offset = 0; break; case 24: var->red.offset = 16; var->green.offset = 8; var->blue.offset = 0; var->red.length = 8; var->green.length = 8; var->blue.length = 8; var->transp.offset = 0; var->transp.length = 0; break; case 32: var->red.offset = 16; var->green.offset = 8; var->blue.offset = 0; var->red.length = 8; var->green.length = 8; var->blue.length = 8; var->transp.offset = 24; var->transp.length = 8; break; default: break; } } static void __fill_var(struct fb_var_screeninfo *var, struct fb_info *info, struct drm_framebuffer *fb) { int i; var->xres_virtual = fb->width; var->yres_virtual = fb->height; var->accel_flags = 0; var->bits_per_pixel = drm_format_info_bpp(fb->format, 0); var->height = info->var.height; var->width = info->var.width; var->left_margin = var->right_margin = 0; var->upper_margin = var->lower_margin = 0; var->hsync_len = var->vsync_len = 0; var->sync = var->vmode = 0; var->rotate = 0; var->colorspace = 0; for (i = 0; i < 4; i++) var->reserved[i] = 0; } /** * drm_fb_helper_check_var - implementation for &fb_ops.fb_check_var * @var: screeninfo to check * @info: fbdev registered by the helper */ int drm_fb_helper_check_var(struct fb_var_screeninfo *var, struct fb_info *info) { struct drm_fb_helper *fb_helper = info->par; struct drm_framebuffer *fb = fb_helper->fb; const struct drm_format_info *format = fb->format; struct drm_device *dev = fb_helper->dev; unsigned int bpp; if (in_dbg_master()) return -EINVAL; if (var->pixclock != 0) { drm_dbg_kms(dev, "fbdev emulation doesn't support changing the pixel clock, value of pixclock is ignored\n"); var->pixclock = 0; } switch (format->format) { case DRM_FORMAT_C1: case DRM_FORMAT_C2: case DRM_FORMAT_C4: /* supported format with sub-byte pixels */ break; default: if ((drm_format_info_block_width(format, 0) > 1) || (drm_format_info_block_height(format, 0) > 1)) return -EINVAL; break; } /* * Changes struct fb_var_screeninfo are currently not pushed back * to KMS, hence fail if different settings are requested. */ bpp = drm_format_info_bpp(format, 0); if (var->bits_per_pixel > bpp || var->xres > fb->width || var->yres > fb->height || var->xres_virtual > fb->width || var->yres_virtual > fb->height) { drm_dbg_kms(dev, "fb requested width/height/bpp can't fit in current fb " "request %dx%d-%d (virtual %dx%d) > %dx%d-%d\n", var->xres, var->yres, var->bits_per_pixel, var->xres_virtual, var->yres_virtual, fb->width, fb->height, bpp); return -EINVAL; } __fill_var(var, info, fb); /* * fb_pan_display() validates this, but fb_set_par() doesn't and just * falls over. Note that __fill_var above adjusts y/res_virtual. */ if (var->yoffset > var->yres_virtual - var->yres || var->xoffset > var->xres_virtual - var->xres) return -EINVAL; /* We neither support grayscale nor FOURCC (also stored in here). */ if (var->grayscale > 0) return -EINVAL; if (var->nonstd) return -EINVAL; /* * Workaround for SDL 1.2, which is known to be setting all pixel format * fields values to zero in some cases. We treat this situation as a * kind of "use some reasonable autodetected values". */ if (!var->red.offset && !var->green.offset && !var->blue.offset && !var->transp.offset && !var->red.length && !var->green.length && !var->blue.length && !var->transp.length && !var->red.msb_right && !var->green.msb_right && !var->blue.msb_right && !var->transp.msb_right) { drm_fb_helper_fill_pixel_fmt(var, format); } /* * drm fbdev emulation doesn't support changing the pixel format at all, * so reject all pixel format changing requests. */ if (!drm_fb_pixel_format_equal(var, &info->var)) { drm_dbg_kms(dev, "fbdev emulation doesn't support changing the pixel format\n"); return -EINVAL; } return 0; } EXPORT_SYMBOL(drm_fb_helper_check_var); /** * drm_fb_helper_set_par - implementation for &fb_ops.fb_set_par * @info: fbdev registered by the helper * * This will let fbcon do the mode init and is called at initialization time by * the fbdev core when registering the driver, and later on through the hotplug * callback. */ int drm_fb_helper_set_par(struct fb_info *info) { struct drm_fb_helper *fb_helper = info->par; struct fb_var_screeninfo *var = &info->var; bool force; if (oops_in_progress) return -EBUSY; /* * Normally we want to make sure that a kms master takes precedence over * fbdev, to avoid fbdev flickering and occasionally stealing the * display status. But Xorg first sets the vt back to text mode using * the KDSET IOCTL with KD_TEXT, and only after that drops the master * status when exiting. * * In the past this was caught by drm_fb_helper_lastclose(), but on * modern systems where logind always keeps a drm fd open to orchestrate * the vt switching, this doesn't work. * * To not break the userspace ABI we have this special case here, which * is only used for the above case. Everything else uses the normal * commit function, which ensures that we never steal the display from * an active drm master. */ force = var->activate & FB_ACTIVATE_KD_TEXT; __drm_fb_helper_restore_fbdev_mode_unlocked(fb_helper, force); return 0; } EXPORT_SYMBOL(drm_fb_helper_set_par); static void pan_set(struct drm_fb_helper *fb_helper, int x, int y) { struct drm_mode_set *mode_set; mutex_lock(&fb_helper->client.modeset_mutex); drm_client_for_each_modeset(mode_set, &fb_helper->client) { mode_set->x = x; mode_set->y = y; } mutex_unlock(&fb_helper->client.modeset_mutex); } static int pan_display_atomic(struct fb_var_screeninfo *var, struct fb_info *info) { struct drm_fb_helper *fb_helper = info->par; int ret; pan_set(fb_helper, var->xoffset, var->yoffset); ret = drm_client_modeset_commit_locked(&fb_helper->client); if (!ret) { info->var.xoffset = var->xoffset; info->var.yoffset = var->yoffset; } else pan_set(fb_helper, info->var.xoffset, info->var.yoffset); return ret; } static int pan_display_legacy(struct fb_var_screeninfo *var, struct fb_info *info) { struct drm_fb_helper *fb_helper = info->par; struct drm_client_dev *client = &fb_helper->client; struct drm_mode_set *modeset; int ret = 0; mutex_lock(&client->modeset_mutex); drm_modeset_lock_all(fb_helper->dev); drm_client_for_each_modeset(modeset, client) { modeset->x = var->xoffset; modeset->y = var->yoffset; if (modeset->num_connectors) { ret = drm_mode_set_config_internal(modeset); if (!ret) { info->var.xoffset = var->xoffset; info->var.yoffset = var->yoffset; } } } drm_modeset_unlock_all(fb_helper->dev); mutex_unlock(&client->modeset_mutex); return ret; } /** * drm_fb_helper_pan_display - implementation for &fb_ops.fb_pan_display * @var: updated screen information * @info: fbdev registered by the helper */ int drm_fb_helper_pan_display(struct fb_var_screeninfo *var, struct fb_info *info) { struct drm_fb_helper *fb_helper = info->par; struct drm_device *dev = fb_helper->dev; int ret; if (oops_in_progress) return -EBUSY; mutex_lock(&fb_helper->lock); if (!drm_master_internal_acquire(dev)) { ret = -EBUSY; goto unlock; } if (drm_drv_uses_atomic_modeset(dev)) ret = pan_display_atomic(var, info); else ret = pan_display_legacy(var, info); drm_master_internal_release(dev); unlock: mutex_unlock(&fb_helper->lock); return ret; } EXPORT_SYMBOL(drm_fb_helper_pan_display); static uint32_t drm_fb_helper_find_format(struct drm_fb_helper *fb_helper, const uint32_t *formats, size_t format_count, uint32_t bpp, uint32_t depth) { struct drm_device *dev = fb_helper->dev; uint32_t format; size_t i; /* * Do not consider YUV or other complicated formats * for framebuffers. This means only legacy formats * are supported (fmt->depth is a legacy field), but * the framebuffer emulation can only deal with such * formats, specifically RGB/BGA formats. */ format = drm_mode_legacy_fb_format(bpp, depth); if (!format) goto err; for (i = 0; i < format_count; ++i) { if (formats[i] == format) return format; } err: /* We found nothing. */ drm_warn(dev, "bpp/depth value of %u/%u not supported\n", bpp, depth); return DRM_FORMAT_INVALID; } static uint32_t drm_fb_helper_find_color_mode_format(struct drm_fb_helper *fb_helper, const uint32_t *formats, size_t format_count, unsigned int color_mode) { struct drm_device *dev = fb_helper->dev; uint32_t bpp, depth; switch (color_mode) { case 1: case 2: case 4: case 8: case 16: case 24: bpp = depth = color_mode; break; case 15: bpp = 16; depth = 15; break; case 32: bpp = 32; depth = 24; break; default: drm_info(dev, "unsupported color mode of %d\n", color_mode); return DRM_FORMAT_INVALID; } return drm_fb_helper_find_format(fb_helper, formats, format_count, bpp, depth); } static int __drm_fb_helper_find_sizes(struct drm_fb_helper *fb_helper, struct drm_fb_helper_surface_size *sizes) { struct drm_client_dev *client = &fb_helper->client; struct drm_device *dev = fb_helper->dev; int crtc_count = 0; struct drm_connector_list_iter conn_iter; struct drm_connector *connector; struct drm_mode_set *mode_set; uint32_t surface_format = DRM_FORMAT_INVALID; const struct drm_format_info *info; memset(sizes, 0, sizeof(*sizes)); sizes->fb_width = (u32)-1; sizes->fb_height = (u32)-1; drm_client_for_each_modeset(mode_set, client) { struct drm_crtc *crtc = mode_set->crtc; struct drm_plane *plane = crtc->primary; drm_dbg_kms(dev, "test CRTC %u primary plane\n", drm_crtc_index(crtc)); drm_connector_list_iter_begin(fb_helper->dev, &conn_iter); drm_client_for_each_connector_iter(connector, &conn_iter) { struct drm_cmdline_mode *cmdline_mode = &connector->cmdline_mode; if (!cmdline_mode->bpp_specified) continue; surface_format = drm_fb_helper_find_color_mode_format(fb_helper, plane->format_types, plane->format_count, cmdline_mode->bpp); if (surface_format != DRM_FORMAT_INVALID) break; /* found supported format */ } drm_connector_list_iter_end(&conn_iter); if (surface_format != DRM_FORMAT_INVALID) break; /* found supported format */ /* try preferred color mode */ surface_format = drm_fb_helper_find_color_mode_format(fb_helper, plane->format_types, plane->format_count, fb_helper->preferred_bpp); if (surface_format != DRM_FORMAT_INVALID) break; /* found supported format */ } if (surface_format == DRM_FORMAT_INVALID) { /* * If none of the given color modes works, fall back * to XRGB8888. Drivers are expected to provide this * format for compatibility with legacy applications. */ drm_warn(dev, "No compatible format found\n"); surface_format = drm_driver_legacy_fb_format(dev, 32, 24); } info = drm_format_info(surface_format); sizes->surface_bpp = drm_format_info_bpp(info, 0); sizes->surface_depth = info->depth; /* first up get a count of crtcs now in use and new min/maxes width/heights */ crtc_count = 0; drm_client_for_each_modeset(mode_set, client) { struct drm_display_mode *desired_mode; int x, y, j; /* in case of tile group, are we the last tile vert or horiz? * If no tile group you are always the last one both vertically * and horizontally */ bool lastv = true, lasth = true; desired_mode = mode_set->mode; if (!desired_mode) continue; crtc_count++; x = mode_set->x; y = mode_set->y; sizes->surface_width = max_t(u32, desired_mode->hdisplay + x, sizes->surface_width); sizes->surface_height = max_t(u32, desired_mode->vdisplay + y, sizes->surface_height); for (j = 0; j < mode_set->num_connectors; j++) { struct drm_connector *connector = mode_set->connectors[j]; if (connector->has_tile && desired_mode->hdisplay == connector->tile_h_size && desired_mode->vdisplay == connector->tile_v_size) { lasth = (connector->tile_h_loc == (connector->num_h_tile - 1)); lastv = (connector->tile_v_loc == (connector->num_v_tile - 1)); /* cloning to multiple tiles is just crazy-talk, so: */ break; } } if (lasth) sizes->fb_width = min_t(u32, desired_mode->hdisplay + x, sizes->fb_width); if (lastv) sizes->fb_height = min_t(u32, desired_mode->vdisplay + y, sizes->fb_height); } if (crtc_count == 0 || sizes->fb_width == -1 || sizes->fb_height == -1) { drm_info(dev, "Cannot find any crtc or sizes\n"); return -EAGAIN; } return 0; } static int drm_fb_helper_find_sizes(struct drm_fb_helper *fb_helper, struct drm_fb_helper_surface_size *sizes) { struct drm_client_dev *client = &fb_helper->client; struct drm_device *dev = fb_helper->dev; struct drm_mode_config *config = &dev->mode_config; int ret; mutex_lock(&client->modeset_mutex); ret = __drm_fb_helper_find_sizes(fb_helper, sizes); mutex_unlock(&client->modeset_mutex); if (ret) return ret; /* Handle our overallocation */ sizes->surface_height *= drm_fbdev_overalloc; sizes->surface_height /= 100; if (sizes->surface_height > config->max_height) { drm_dbg_kms(dev, "Fbdev over-allocation too large; clamping height to %d\n", config->max_height); sizes->surface_height = config->max_height; } return 0; } /* * Allocates the backing storage and sets up the fbdev info structure through * the ->fb_probe callback. */ static int drm_fb_helper_single_fb_probe(struct drm_fb_helper *fb_helper) { struct drm_client_dev *client = &fb_helper->client; struct drm_device *dev = fb_helper->dev; struct drm_fb_helper_surface_size sizes; int ret; ret = drm_fb_helper_find_sizes(fb_helper, &sizes); if (ret) { /* First time: disable all crtc's.. */ if (!fb_helper->deferred_setup) drm_client_modeset_commit(client); return ret; } /* push down into drivers */ ret = (*fb_helper->funcs->fb_probe)(fb_helper, &sizes); if (ret < 0) return ret; strcpy(fb_helper->fb->comm, "[fbcon]"); /* Set the fb info for vgaswitcheroo clients. Does nothing otherwise. */ if (dev_is_pci(dev->dev)) vga_switcheroo_client_fb_set(to_pci_dev(dev->dev), fb_helper->info); return 0; } static void drm_fb_helper_fill_fix(struct fb_info *info, uint32_t pitch, bool is_color_indexed) { info->fix.type = FB_TYPE_PACKED_PIXELS; info->fix.visual = is_color_indexed ? FB_VISUAL_PSEUDOCOLOR : FB_VISUAL_TRUECOLOR; info->fix.mmio_start = 0; info->fix.mmio_len = 0; info->fix.type_aux = 0; info->fix.xpanstep = 1; /* doing it in hw */ info->fix.ypanstep = 1; /* doing it in hw */ info->fix.ywrapstep = 0; info->fix.accel = FB_ACCEL_NONE; info->fix.line_length = pitch; } static void drm_fb_helper_fill_var(struct fb_info *info, struct drm_fb_helper *fb_helper, uint32_t fb_width, uint32_t fb_height) { struct drm_framebuffer *fb = fb_helper->fb; const struct drm_format_info *format = fb->format; switch (format->format) { case DRM_FORMAT_C1: case DRM_FORMAT_C2: case DRM_FORMAT_C4: /* supported format with sub-byte pixels */ break; default: WARN_ON((drm_format_info_block_width(format, 0) > 1) || (drm_format_info_block_height(format, 0) > 1)); break; } info->pseudo_palette = fb_helper->pseudo_palette; info->var.xoffset = 0; info->var.yoffset = 0; __fill_var(&info->var, info, fb); info->var.activate = FB_ACTIVATE_NOW; drm_fb_helper_fill_pixel_fmt(&info->var, format); info->var.xres = fb_width; info->var.yres = fb_height; } /** * drm_fb_helper_fill_info - initializes fbdev information * @info: fbdev instance to set up * @fb_helper: fb helper instance to use as template * @sizes: describes fbdev size and scanout surface size * * Sets up the variable and fixed fbdev metainformation from the given fb helper * instance and the drm framebuffer allocated in &drm_fb_helper.fb. * * Drivers should call this (or their equivalent setup code) from their * &drm_fb_helper_funcs.fb_probe callback after having allocated the fbdev * backing storage framebuffer. */ void drm_fb_helper_fill_info(struct fb_info *info, struct drm_fb_helper *fb_helper, struct drm_fb_helper_surface_size *sizes) { struct drm_framebuffer *fb = fb_helper->fb; drm_fb_helper_fill_fix(info, fb->pitches[0], fb->format->is_color_indexed); drm_fb_helper_fill_var(info, fb_helper, sizes->fb_width, sizes->fb_height); info->par = fb_helper; /* * The DRM drivers fbdev emulation device name can be confusing if the * driver name also has a "drm" suffix on it. Leading to names such as * "simpledrmdrmfb" in /proc/fb. Unfortunately, it's an uAPI and can't * be changed due user-space tools (e.g: pm-utils) matching against it. */ snprintf(info->fix.id, sizeof(info->fix.id), "%sdrmfb", fb_helper->dev->driver->name); } EXPORT_SYMBOL(drm_fb_helper_fill_info); /* * This is a continuation of drm_setup_crtcs() that sets up anything related * to the framebuffer. During initialization, drm_setup_crtcs() is called before * the framebuffer has been allocated (fb_helper->fb and fb_helper->info). * So, any setup that touches those fields needs to be done here instead of in * drm_setup_crtcs(). */ static void drm_setup_crtcs_fb(struct drm_fb_helper *fb_helper) { struct drm_client_dev *client = &fb_helper->client; struct drm_connector_list_iter conn_iter; struct fb_info *info = fb_helper->info; unsigned int rotation, sw_rotations = 0; struct drm_connector *connector; struct drm_mode_set *modeset; mutex_lock(&client->modeset_mutex); drm_client_for_each_modeset(modeset, client) { if (!modeset->num_connectors) continue; modeset->fb = fb_helper->fb; if (drm_client_rotation(modeset, &rotation)) /* Rotating in hardware, fbcon should not rotate */ sw_rotations |= DRM_MODE_ROTATE_0; else sw_rotations |= rotation; } mutex_unlock(&client->modeset_mutex); drm_connector_list_iter_begin(fb_helper->dev, &conn_iter); drm_client_for_each_connector_iter(connector, &conn_iter) { /* use first connected connector for the physical dimensions */ if (connector->status == connector_status_connected) { info->var.width = connector->display_info.width_mm; info->var.height = connector->display_info.height_mm; break; } } drm_connector_list_iter_end(&conn_iter); switch (sw_rotations) { case DRM_MODE_ROTATE_0: info->fbcon_rotate_hint = FB_ROTATE_UR; break; case DRM_MODE_ROTATE_90: info->fbcon_rotate_hint = FB_ROTATE_CCW; break; case DRM_MODE_ROTATE_180: info->fbcon_rotate_hint = FB_ROTATE_UD; break; case DRM_MODE_ROTATE_270: info->fbcon_rotate_hint = FB_ROTATE_CW; break; default: /* * Multiple bits are set / multiple rotations requested * fbcon cannot handle separate rotation settings per * output, so fallback to unrotated. */ info->fbcon_rotate_hint = FB_ROTATE_UR; } } /* Note: Drops fb_helper->lock before returning. */ static int __drm_fb_helper_initial_config_and_unlock(struct drm_fb_helper *fb_helper) { struct drm_device *dev = fb_helper->dev; struct fb_info *info; unsigned int width, height; int ret; width = dev->mode_config.max_width; height = dev->mode_config.max_height; drm_client_modeset_probe(&fb_helper->client, width, height); ret = drm_fb_helper_single_fb_probe(fb_helper); if (ret < 0) { if (ret == -EAGAIN) { fb_helper->deferred_setup = true; ret = 0; } mutex_unlock(&fb_helper->lock); return ret; } drm_setup_crtcs_fb(fb_helper); fb_helper->deferred_setup = false; info = fb_helper->info; info->var.pixclock = 0; /* Need to drop locks to avoid recursive deadlock in * register_framebuffer. This is ok because the only thing left to do is * register the fbdev emulation instance in kernel_fb_helper_list. */ mutex_unlock(&fb_helper->lock); ret = register_framebuffer(info); if (ret < 0) return ret; drm_info(dev, "fb%d: %s frame buffer device\n", info->node, info->fix.id); mutex_lock(&kernel_fb_helper_lock); if (list_empty(&kernel_fb_helper_list)) register_sysrq_key('v', &sysrq_drm_fb_helper_restore_op); list_add(&fb_helper->kernel_fb_list, &kernel_fb_helper_list); mutex_unlock(&kernel_fb_helper_lock); return 0; } /** * drm_fb_helper_initial_config - setup a sane initial connector configuration * @fb_helper: fb_helper device struct * * Scans the CRTCs and connectors and tries to put together an initial setup. * At the moment, this is a cloned configuration across all heads with * a new framebuffer object as the backing store. * * Note that this also registers the fbdev and so allows userspace to call into * the driver through the fbdev interfaces. * * This function will call down into the &drm_fb_helper_funcs.fb_probe callback * to let the driver allocate and initialize the fbdev info structure and the * drm framebuffer used to back the fbdev. drm_fb_helper_fill_info() is provided * as a helper to setup simple default values for the fbdev info structure. * * HANG DEBUGGING: * * When you have fbcon support built-in or already loaded, this function will do * a full modeset to setup the fbdev console. Due to locking misdesign in the * VT/fbdev subsystem that entire modeset sequence has to be done while holding * console_lock. Until console_unlock is called no dmesg lines will be sent out * to consoles, not even serial console. This means when your driver crashes, * you will see absolutely nothing else but a system stuck in this function, * with no further output. Any kind of printk() you place within your own driver * or in the drm core modeset code will also never show up. * * Standard debug practice is to run the fbcon setup without taking the * console_lock as a hack, to be able to see backtraces and crashes on the * serial line. This can be done by setting the fb.lockless_register_fb=1 kernel * cmdline option. * * The other option is to just disable fbdev emulation since very likely the * first modeset from userspace will crash in the same way, and is even easier * to debug. This can be done by setting the drm_kms_helper.fbdev_emulation=0 * kernel cmdline option. * * RETURNS: * Zero if everything went ok, nonzero otherwise. */ int drm_fb_helper_initial_config(struct drm_fb_helper *fb_helper) { int ret; if (!drm_fbdev_emulation) return 0; mutex_lock(&fb_helper->lock); ret = __drm_fb_helper_initial_config_and_unlock(fb_helper); return ret; } EXPORT_SYMBOL(drm_fb_helper_initial_config); /** * drm_fb_helper_hotplug_event - respond to a hotplug notification by * probing all the outputs attached to the fb * @fb_helper: driver-allocated fbdev helper, can be NULL * * Scan the connectors attached to the fb_helper and try to put together a * setup after notification of a change in output configuration. * * Called at runtime, takes the mode config locks to be able to check/change the * modeset configuration. Must be run from process context (which usually means * either the output polling work or a work item launched from the driver's * hotplug interrupt). * * Note that drivers may call this even before calling * drm_fb_helper_initial_config but only after drm_fb_helper_init. This allows * for a race-free fbcon setup and will make sure that the fbdev emulation will * not miss any hotplug events. * * RETURNS: * 0 on success and a non-zero error code otherwise. */ int drm_fb_helper_hotplug_event(struct drm_fb_helper *fb_helper) { int err = 0; if (!drm_fbdev_emulation || !fb_helper) return 0; mutex_lock(&fb_helper->lock); if (fb_helper->deferred_setup) { err = __drm_fb_helper_initial_config_and_unlock(fb_helper); return err; } if (!fb_helper->fb || !drm_master_internal_acquire(fb_helper->dev)) { fb_helper->delayed_hotplug = true; mutex_unlock(&fb_helper->lock); return err; } drm_master_internal_release(fb_helper->dev); drm_dbg_kms(fb_helper->dev, "\n"); drm_client_modeset_probe(&fb_helper->client, fb_helper->fb->width, fb_helper->fb->height); drm_setup_crtcs_fb(fb_helper); mutex_unlock(&fb_helper->lock); drm_fb_helper_set_par(fb_helper->info); return 0; } EXPORT_SYMBOL(drm_fb_helper_hotplug_event); /** * drm_fb_helper_lastclose - DRM driver lastclose helper for fbdev emulation * @dev: DRM device * * This function can be used as the &drm_driver->lastclose callback for drivers * that only need to call drm_fb_helper_restore_fbdev_mode_unlocked(). */ void drm_fb_helper_lastclose(struct drm_device *dev) { drm_fb_helper_restore_fbdev_mode_unlocked(dev->fb_helper); } EXPORT_SYMBOL(drm_fb_helper_lastclose); /** * drm_fb_helper_output_poll_changed - DRM mode config \.output_poll_changed * helper for fbdev emulation * @dev: DRM device * * This function can be used as the * &drm_mode_config_funcs.output_poll_changed callback for drivers that only * need to call drm_fbdev.hotplug_event(). */ void drm_fb_helper_output_poll_changed(struct drm_device *dev) { drm_fb_helper_hotplug_event(dev->fb_helper); } EXPORT_SYMBOL(drm_fb_helper_output_poll_changed);
44 57 99 18 18 18 10 10 1 10 2 2 1 10 18 129 74 134 121 121 80 80 11 5 119 120 119 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 // SPDX-License-Identifier: GPL-2.0-only /* * mm/interval_tree.c - interval tree for mapping->i_mmap * * Copyright (C) 2012, Michel Lespinasse <walken@google.com> */ #include <linux/mm.h> #include <linux/fs.h> #include <linux/rmap.h> #include <linux/interval_tree_generic.h> static inline unsigned long vma_start_pgoff(struct vm_area_struct *v) { return v->vm_pgoff; } static inline unsigned long vma_last_pgoff(struct vm_area_struct *v) { return v->vm_pgoff + vma_pages(v) - 1; } INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb, unsigned long, shared.rb_subtree_last, vma_start_pgoff, vma_last_pgoff, /* empty */, vma_interval_tree) /* Insert node immediately after prev in the interval tree */ void vma_interval_tree_insert_after(struct vm_area_struct *node, struct vm_area_struct *prev, struct rb_root_cached *root) { struct rb_node **link; struct vm_area_struct *parent; unsigned long last = vma_last_pgoff(node); VM_BUG_ON_VMA(vma_start_pgoff(node) != vma_start_pgoff(prev), node); if (!prev->shared.rb.rb_right) { parent = prev; link = &prev->shared.rb.rb_right; } else { parent = rb_entry(prev->shared.rb.rb_right, struct vm_area_struct, shared.rb); if (parent->shared.rb_subtree_last < last) parent->shared.rb_subtree_last = last; while (parent->shared.rb.rb_left) { parent = rb_entry(parent->shared.rb.rb_left, struct vm_area_struct, shared.rb); if (parent->shared.rb_subtree_last < last) parent->shared.rb_subtree_last = last; } link = &parent->shared.rb.rb_left; } node->shared.rb_subtree_last = last; rb_link_node(&node->shared.rb, &parent->shared.rb, link); rb_insert_augmented(&node->shared.rb, &root->rb_root, &vma_interval_tree_augment); } static inline unsigned long avc_start_pgoff(struct anon_vma_chain *avc) { return vma_start_pgoff(avc->vma); } static inline unsigned long avc_last_pgoff(struct anon_vma_chain *avc) { return vma_last_pgoff(avc->vma); } INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last, avc_start_pgoff, avc_last_pgoff, static inline, __anon_vma_interval_tree) void anon_vma_interval_tree_insert(struct anon_vma_chain *node, struct rb_root_cached *root) { #ifdef CONFIG_DEBUG_VM_RB node->cached_vma_start = avc_start_pgoff(node); node->cached_vma_last = avc_last_pgoff(node); #endif __anon_vma_interval_tree_insert(node, root); } void anon_vma_interval_tree_remove(struct anon_vma_chain *node, struct rb_root_cached *root) { __anon_vma_interval_tree_remove(node, root); } struct anon_vma_chain * anon_vma_interval_tree_iter_first(struct rb_root_cached *root, unsigned long first, unsigned long last) { return __anon_vma_interval_tree_iter_first(root, first, last); } struct anon_vma_chain * anon_vma_interval_tree_iter_next(struct anon_vma_chain *node, unsigned long first, unsigned long last) { return __anon_vma_interval_tree_iter_next(node, first, last); } #ifdef CONFIG_DEBUG_VM_RB void anon_vma_interval_tree_verify(struct anon_vma_chain *node) { WARN_ON_ONCE(node->cached_vma_start != avc_start_pgoff(node)); WARN_ON_ONCE(node->cached_vma_last != avc_last_pgoff(node)); } #endif
19 18 18 18 18 18 17 17 18 18 13 13 13 13 12 11 9 11 11 11 11 10 10 11 11 3 19 19 16 18 11 11 11 18 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/readdir.c * * Copyright (C) 1995 Linus Torvalds */ #include <linux/stddef.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/time.h> #include <linux/mm.h> #include <linux/errno.h> #include <linux/stat.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/fsnotify.h> #include <linux/dirent.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/unistd.h> #include <linux/compat.h> #include <linux/uaccess.h> #include <asm/unaligned.h> /* * Some filesystems were never converted to '->iterate_shared()' * and their directory iterators want the inode lock held for * writing. This wrapper allows for converting from the shared * semantics to the exclusive inode use. */ int wrap_directory_iterator(struct file *file, struct dir_context *ctx, int (*iter)(struct file *, struct dir_context *)) { struct inode *inode = file_inode(file); int ret; /* * We'd love to have an 'inode_upgrade_trylock()' operation, * see the comment in mmap_upgrade_trylock() in mm/memory.c. * * But considering this is for "filesystems that never got * converted", it really doesn't matter. * * Also note that since we have to return with the lock held * for reading, we can't use the "killable()" locking here, * since we do need to get the lock even if we're dying. * * We could do the write part killably and then get the read * lock unconditionally if it mattered, but see above on why * this does the very simplistic conversion. */ up_read(&inode->i_rwsem); down_write(&inode->i_rwsem); /* * Since we dropped the inode lock, we should do the * DEADDIR test again. See 'iterate_dir()' below. * * Note that we don't need to re-do the f_pos games, * since the file must be locked wrt f_pos anyway. */ ret = -ENOENT; if (!IS_DEADDIR(inode)) ret = iter(file, ctx); downgrade_write(&inode->i_rwsem); return ret; } EXPORT_SYMBOL(wrap_directory_iterator); /* * Note the "unsafe_put_user() semantics: we goto a * label for errors. */ #define unsafe_copy_dirent_name(_dst, _src, _len, label) do { \ char __user *dst = (_dst); \ const char *src = (_src); \ size_t len = (_len); \ unsafe_put_user(0, dst+len, label); \ unsafe_copy_to_user(dst, src, len, label); \ } while (0) int iterate_dir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); int res = -ENOTDIR; if (!file->f_op->iterate_shared) goto out; res = security_file_permission(file, MAY_READ); if (res) goto out; res = fsnotify_file_perm(file, MAY_READ); if (res) goto out; res = down_read_killable(&inode->i_rwsem); if (res) goto out; res = -ENOENT; if (!IS_DEADDIR(inode)) { ctx->pos = file->f_pos; res = file->f_op->iterate_shared(file, ctx); file->f_pos = ctx->pos; fsnotify_access(file); file_accessed(file); } inode_unlock_shared(inode); out: return res; } EXPORT_SYMBOL(iterate_dir); /* * POSIX says that a dirent name cannot contain NULL or a '/'. * * It's not 100% clear what we should really do in this case. * The filesystem is clearly corrupted, but returning a hard * error means that you now don't see any of the other names * either, so that isn't a perfect alternative. * * And if you return an error, what error do you use? Several * filesystems seem to have decided on EUCLEAN being the error * code for EFSCORRUPTED, and that may be the error to use. Or * just EIO, which is perhaps more obvious to users. * * In order to see the other file names in the directory, the * caller might want to make this a "soft" error: skip the * entry, and return the error at the end instead. * * Note that this should likely do a "memchr(name, 0, len)" * check too, since that would be filesystem corruption as * well. However, that case can't actually confuse user space, * which has to do a strlen() on the name anyway to find the * filename length, and the above "soft error" worry means * that it's probably better left alone until we have that * issue clarified. * * Note the PATH_MAX check - it's arbitrary but the real * kernel limit on a possible path component, not NAME_MAX, * which is the technical standard limit. */ static int verify_dirent_name(const char *name, int len) { if (len <= 0 || len >= PATH_MAX) return -EIO; if (memchr(name, '/', len)) return -EIO; return 0; } /* * Traditional linux readdir() handling.. * * "count=1" is a special case, meaning that the buffer is one * dirent-structure in size and that the code can't handle more * anyway. Thus the special "fillonedir()" function for that * case (the low-level handlers don't need to care about this). */ #ifdef __ARCH_WANT_OLD_READDIR struct old_linux_dirent { unsigned long d_ino; unsigned long d_offset; unsigned short d_namlen; char d_name[]; }; struct readdir_callback { struct dir_context ctx; struct old_linux_dirent __user * dirent; int result; }; static bool fillonedir(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { struct readdir_callback *buf = container_of(ctx, struct readdir_callback, ctx); struct old_linux_dirent __user * dirent; unsigned long d_ino; if (buf->result) return false; buf->result = verify_dirent_name(name, namlen); if (buf->result) return false; d_ino = ino; if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { buf->result = -EOVERFLOW; return false; } buf->result++; dirent = buf->dirent; if (!user_write_access_begin(dirent, (unsigned long)(dirent->d_name + namlen + 1) - (unsigned long)dirent)) goto efault; unsafe_put_user(d_ino, &dirent->d_ino, efault_end); unsafe_put_user(offset, &dirent->d_offset, efault_end); unsafe_put_user(namlen, &dirent->d_namlen, efault_end); unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); user_write_access_end(); return true; efault_end: user_write_access_end(); efault: buf->result = -EFAULT; return false; } SYSCALL_DEFINE3(old_readdir, unsigned int, fd, struct old_linux_dirent __user *, dirent, unsigned int, count) { int error; struct fd f = fdget_pos(fd); struct readdir_callback buf = { .ctx.actor = fillonedir, .dirent = dirent }; if (!f.file) return -EBADF; error = iterate_dir(f.file, &buf.ctx); if (buf.result) error = buf.result; fdput_pos(f); return error; } #endif /* __ARCH_WANT_OLD_READDIR */ /* * New, all-improved, singing, dancing, iBCS2-compliant getdents() * interface. */ struct linux_dirent { unsigned long d_ino; unsigned long d_off; unsigned short d_reclen; char d_name[]; }; struct getdents_callback { struct dir_context ctx; struct linux_dirent __user * current_dir; int prev_reclen; int count; int error; }; static bool filldir(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { struct linux_dirent __user *dirent, *prev; struct getdents_callback *buf = container_of(ctx, struct getdents_callback, ctx); unsigned long d_ino; int reclen = ALIGN(offsetof(struct linux_dirent, d_name) + namlen + 2, sizeof(long)); int prev_reclen; buf->error = verify_dirent_name(name, namlen); if (unlikely(buf->error)) return false; buf->error = -EINVAL; /* only used if we fail.. */ if (reclen > buf->count) return false; d_ino = ino; if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { buf->error = -EOVERFLOW; return false; } prev_reclen = buf->prev_reclen; if (prev_reclen && signal_pending(current)) return false; dirent = buf->current_dir; prev = (void __user *) dirent - prev_reclen; if (!user_write_access_begin(prev, reclen + prev_reclen)) goto efault; /* This might be 'dirent->d_off', but if so it will get overwritten */ unsafe_put_user(offset, &prev->d_off, efault_end); unsafe_put_user(d_ino, &dirent->d_ino, efault_end); unsafe_put_user(reclen, &dirent->d_reclen, efault_end); unsafe_put_user(d_type, (char __user *) dirent + reclen - 1, efault_end); unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); user_write_access_end(); buf->current_dir = (void __user *)dirent + reclen; buf->prev_reclen = reclen; buf->count -= reclen; return true; efault_end: user_write_access_end(); efault: buf->error = -EFAULT; return false; } SYSCALL_DEFINE3(getdents, unsigned int, fd, struct linux_dirent __user *, dirent, unsigned int, count) { struct fd f; struct getdents_callback buf = { .ctx.actor = filldir, .count = count, .current_dir = dirent }; int error; f = fdget_pos(fd); if (!f.file) return -EBADF; error = iterate_dir(f.file, &buf.ctx); if (error >= 0) error = buf.error; if (buf.prev_reclen) { struct linux_dirent __user * lastdirent; lastdirent = (void __user *)buf.current_dir - buf.prev_reclen; if (put_user(buf.ctx.pos, &lastdirent->d_off)) error = -EFAULT; else error = count - buf.count; } fdput_pos(f); return error; } struct getdents_callback64 { struct dir_context ctx; struct linux_dirent64 __user * current_dir; int prev_reclen; int count; int error; }; static bool filldir64(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { struct linux_dirent64 __user *dirent, *prev; struct getdents_callback64 *buf = container_of(ctx, struct getdents_callback64, ctx); int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1, sizeof(u64)); int prev_reclen; buf->error = verify_dirent_name(name, namlen); if (unlikely(buf->error)) return false; buf->error = -EINVAL; /* only used if we fail.. */ if (reclen > buf->count) return false; prev_reclen = buf->prev_reclen; if (prev_reclen && signal_pending(current)) return false; dirent = buf->current_dir; prev = (void __user *)dirent - prev_reclen; if (!user_write_access_begin(prev, reclen + prev_reclen)) goto efault; /* This might be 'dirent->d_off', but if so it will get overwritten */ unsafe_put_user(offset, &prev->d_off, efault_end); unsafe_put_user(ino, &dirent->d_ino, efault_end); unsafe_put_user(reclen, &dirent->d_reclen, efault_end); unsafe_put_user(d_type, &dirent->d_type, efault_end); unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); user_write_access_end(); buf->prev_reclen = reclen; buf->current_dir = (void __user *)dirent + reclen; buf->count -= reclen; return true; efault_end: user_write_access_end(); efault: buf->error = -EFAULT; return false; } SYSCALL_DEFINE3(getdents64, unsigned int, fd, struct linux_dirent64 __user *, dirent, unsigned int, count) { struct fd f; struct getdents_callback64 buf = { .ctx.actor = filldir64, .count = count, .current_dir = dirent }; int error; f = fdget_pos(fd); if (!f.file) return -EBADF; error = iterate_dir(f.file, &buf.ctx); if (error >= 0) error = buf.error; if (buf.prev_reclen) { struct linux_dirent64 __user * lastdirent; typeof(lastdirent->d_off) d_off = buf.ctx.pos; lastdirent = (void __user *) buf.current_dir - buf.prev_reclen; if (put_user(d_off, &lastdirent->d_off)) error = -EFAULT; else error = count - buf.count; } fdput_pos(f); return error; } #ifdef CONFIG_COMPAT struct compat_old_linux_dirent { compat_ulong_t d_ino; compat_ulong_t d_offset; unsigned short d_namlen; char d_name[]; }; struct compat_readdir_callback { struct dir_context ctx; struct compat_old_linux_dirent __user *dirent; int result; }; static bool compat_fillonedir(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { struct compat_readdir_callback *buf = container_of(ctx, struct compat_readdir_callback, ctx); struct compat_old_linux_dirent __user *dirent; compat_ulong_t d_ino; if (buf->result) return false; buf->result = verify_dirent_name(name, namlen); if (buf->result) return false; d_ino = ino; if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { buf->result = -EOVERFLOW; return false; } buf->result++; dirent = buf->dirent; if (!user_write_access_begin(dirent, (unsigned long)(dirent->d_name + namlen + 1) - (unsigned long)dirent)) goto efault; unsafe_put_user(d_ino, &dirent->d_ino, efault_end); unsafe_put_user(offset, &dirent->d_offset, efault_end); unsafe_put_user(namlen, &dirent->d_namlen, efault_end); unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); user_write_access_end(); return true; efault_end: user_write_access_end(); efault: buf->result = -EFAULT; return false; } COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd, struct compat_old_linux_dirent __user *, dirent, unsigned int, count) { int error; struct fd f = fdget_pos(fd); struct compat_readdir_callback buf = { .ctx.actor = compat_fillonedir, .dirent = dirent }; if (!f.file) return -EBADF; error = iterate_dir(f.file, &buf.ctx); if (buf.result) error = buf.result; fdput_pos(f); return error; } struct compat_linux_dirent { compat_ulong_t d_ino; compat_ulong_t d_off; unsigned short d_reclen; char d_name[]; }; struct compat_getdents_callback { struct dir_context ctx; struct compat_linux_dirent __user *current_dir; int prev_reclen; int count; int error; }; static bool compat_filldir(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { struct compat_linux_dirent __user *dirent, *prev; struct compat_getdents_callback *buf = container_of(ctx, struct compat_getdents_callback, ctx); compat_ulong_t d_ino; int reclen = ALIGN(offsetof(struct compat_linux_dirent, d_name) + namlen + 2, sizeof(compat_long_t)); int prev_reclen; buf->error = verify_dirent_name(name, namlen); if (unlikely(buf->error)) return false; buf->error = -EINVAL; /* only used if we fail.. */ if (reclen > buf->count) return false; d_ino = ino; if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { buf->error = -EOVERFLOW; return false; } prev_reclen = buf->prev_reclen; if (prev_reclen && signal_pending(current)) return false; dirent = buf->current_dir; prev = (void __user *) dirent - prev_reclen; if (!user_write_access_begin(prev, reclen + prev_reclen)) goto efault; unsafe_put_user(offset, &prev->d_off, efault_end); unsafe_put_user(d_ino, &dirent->d_ino, efault_end); unsafe_put_user(reclen, &dirent->d_reclen, efault_end); unsafe_put_user(d_type, (char __user *) dirent + reclen - 1, efault_end); unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); user_write_access_end(); buf->prev_reclen = reclen; buf->current_dir = (void __user *)dirent + reclen; buf->count -= reclen; return true; efault_end: user_write_access_end(); efault: buf->error = -EFAULT; return false; } COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd, struct compat_linux_dirent __user *, dirent, unsigned int, count) { struct fd f; struct compat_getdents_callback buf = { .ctx.actor = compat_filldir, .current_dir = dirent, .count = count }; int error; f = fdget_pos(fd); if (!f.file) return -EBADF; error = iterate_dir(f.file, &buf.ctx); if (error >= 0) error = buf.error; if (buf.prev_reclen) { struct compat_linux_dirent __user * lastdirent; lastdirent = (void __user *)buf.current_dir - buf.prev_reclen; if (put_user(buf.ctx.pos, &lastdirent->d_off)) error = -EFAULT; else error = count - buf.count; } fdput_pos(f); return error; } #endif
61 47 50 73 14 75 61 61 60 59 1 58 24 24 23 29 17 29 9 29 9 6 3 2 1 3 11 11 1 10 10 11 12 7 12 11 9 9 1 2 2 2 12 13 12 13 12 1 12 15 7 4 2 13 12 14 3 2 3 3 1 1 11 11 10 6 5 4 2 2 2 2 54 53 8 45 54 5 5 5 5 5 5 1 3 1 1 1 6 6 6 1 5 6 5 5 4 4 1 4 1 1 5 5 5 21 21 21 21 19 13 13 20 19 15 21 20 20 13 13 12 12 12 6 8 10 8 9 9 9 9 6 3 1 3 7 2 7 7 7 5 7 7 7 6 3 3 3 1 2 5 3 4 3 1 1 1 1 1 1 2 1 1 1 8 8 8 3 8 5 5 5 4 5 4 1 3 3 3 3 3 3 43 5 8 4 8 7 7 7 7 38 4 4 3 2 2 38 8 7 8 1 4 4 1 4 4 4 2 8 7 7 1 8 1 1 1 1 1 1 55 5 1 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 // SPDX-License-Identifier: GPL-2.0-only /* File: fs/xattr.c Extended attribute handling. Copyright (C) 2001 by Andreas Gruenbacher <a.gruenbacher@computer.org> Copyright (C) 2001 SGI - Silicon Graphics, Inc <linux-xfs@oss.sgi.com> Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> */ #include <linux/fs.h> #include <linux/filelock.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/xattr.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/export.h> #include <linux/fsnotify.h> #include <linux/audit.h> #include <linux/vmalloc.h> #include <linux/posix_acl_xattr.h> #include <linux/uaccess.h> #include "internal.h" static const char * strcmp_prefix(const char *a, const char *a_prefix) { while (*a_prefix && *a == *a_prefix) { a++; a_prefix++; } return *a_prefix ? NULL : a; } /* * In order to implement different sets of xattr operations for each xattr * prefix, a filesystem should create a null-terminated array of struct * xattr_handler (one for each prefix) and hang a pointer to it off of the * s_xattr field of the superblock. */ #define for_each_xattr_handler(handlers, handler) \ if (handlers) \ for ((handler) = *(handlers)++; \ (handler) != NULL; \ (handler) = *(handlers)++) /* * Find the xattr_handler with the matching prefix. */ static const struct xattr_handler * xattr_resolve_name(struct inode *inode, const char **name) { const struct xattr_handler * const *handlers = inode->i_sb->s_xattr; const struct xattr_handler *handler; if (!(inode->i_opflags & IOP_XATTR)) { if (unlikely(is_bad_inode(inode))) return ERR_PTR(-EIO); return ERR_PTR(-EOPNOTSUPP); } for_each_xattr_handler(handlers, handler) { const char *n; n = strcmp_prefix(*name, xattr_prefix(handler)); if (n) { if (!handler->prefix ^ !*n) { if (*n) continue; return ERR_PTR(-EINVAL); } *name = n; return handler; } } return ERR_PTR(-EOPNOTSUPP); } /** * may_write_xattr - check whether inode allows writing xattr * @idmap: idmap of the mount the inode was found from * @inode: the inode on which to set an xattr * * Check whether the inode allows writing xattrs. Specifically, we can never * set or remove an extended attribute on a read-only filesystem or on an * immutable / append-only inode. * * We also need to ensure that the inode has a mapping in the mount to * not risk writing back invalid i_{g,u}id values. * * Return: On success zero is returned. On error a negative errno is returned. */ int may_write_xattr(struct mnt_idmap *idmap, struct inode *inode) { if (IS_IMMUTABLE(inode)) return -EPERM; if (IS_APPEND(inode)) return -EPERM; if (HAS_UNMAPPED_ID(idmap, inode)) return -EPERM; return 0; } /* * Check permissions for extended attribute access. This is a bit complicated * because different namespaces have very different rules. */ static int xattr_permission(struct mnt_idmap *idmap, struct inode *inode, const char *name, int mask) { if (mask & MAY_WRITE) { int ret; ret = may_write_xattr(idmap, inode); if (ret) return ret; } /* * No restriction for security.* and system.* from the VFS. Decision * on these is left to the underlying filesystem / security module. */ if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) || !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) return 0; /* * The trusted.* namespace can only be accessed by privileged users. */ if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) { if (!capable(CAP_SYS_ADMIN)) return (mask & MAY_WRITE) ? -EPERM : -ENODATA; return 0; } /* * In the user.* namespace, only regular files and directories can have * extended attributes. For sticky directories, only the owner and * privileged users can write attributes. */ if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) { if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) return (mask & MAY_WRITE) ? -EPERM : -ENODATA; if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) && (mask & MAY_WRITE) && !inode_owner_or_capable(idmap, inode)) return -EPERM; } return inode_permission(idmap, inode, mask); } /* * Look for any handler that deals with the specified namespace. */ int xattr_supports_user_prefix(struct inode *inode) { const struct xattr_handler * const *handlers = inode->i_sb->s_xattr; const struct xattr_handler *handler; if (!(inode->i_opflags & IOP_XATTR)) { if (unlikely(is_bad_inode(inode))) return -EIO; return -EOPNOTSUPP; } for_each_xattr_handler(handlers, handler) { if (!strncmp(xattr_prefix(handler), XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) return 0; } return -EOPNOTSUPP; } EXPORT_SYMBOL(xattr_supports_user_prefix); int __vfs_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) { const struct xattr_handler *handler; if (is_posix_acl_xattr(name)) return -EOPNOTSUPP; handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); if (!handler->set) return -EOPNOTSUPP; if (size == 0) value = ""; /* empty EA, do not remove */ return handler->set(handler, idmap, dentry, inode, name, value, size, flags); } EXPORT_SYMBOL(__vfs_setxattr); /** * __vfs_setxattr_noperm - perform setxattr operation without performing * permission checks. * * @idmap: idmap of the mount the inode was found from * @dentry: object to perform setxattr on * @name: xattr name to set * @value: value to set @name to * @size: size of @value * @flags: flags to pass into filesystem operations * * returns the result of the internal setxattr or setsecurity operations. * * This function requires the caller to lock the inode's i_mutex before it * is executed. It also assumes that the caller will make the appropriate * permission checks. */ int __vfs_setxattr_noperm(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; int error = -EAGAIN; int issec = !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN); if (issec) inode->i_flags &= ~S_NOSEC; if (inode->i_opflags & IOP_XATTR) { error = __vfs_setxattr(idmap, dentry, inode, name, value, size, flags); if (!error) { fsnotify_xattr(dentry); security_inode_post_setxattr(dentry, name, value, size, flags); } } else { if (unlikely(is_bad_inode(inode))) return -EIO; } if (error == -EAGAIN) { error = -EOPNOTSUPP; if (issec) { const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; error = security_inode_setsecurity(inode, suffix, value, size, flags); if (!error) fsnotify_xattr(dentry); } } return error; } /** * __vfs_setxattr_locked - set an extended attribute while holding the inode * lock * * @idmap: idmap of the mount of the target inode * @dentry: object to perform setxattr on * @name: xattr name to set * @value: value to set @name to * @size: size of @value * @flags: flags to pass into filesystem operations * @delegated_inode: on return, will contain an inode pointer that * a delegation was broken on, NULL if none. */ int __vfs_setxattr_locked(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags, struct inode **delegated_inode) { struct inode *inode = dentry->d_inode; int error; error = xattr_permission(idmap, inode, name, MAY_WRITE); if (error) return error; error = security_inode_setxattr(idmap, dentry, name, value, size, flags); if (error) goto out; error = try_break_deleg(inode, delegated_inode); if (error) goto out; error = __vfs_setxattr_noperm(idmap, dentry, name, value, size, flags); out: return error; } EXPORT_SYMBOL_GPL(__vfs_setxattr_locked); int vfs_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; struct inode *delegated_inode = NULL; const void *orig_value = value; int error; if (size && strcmp(name, XATTR_NAME_CAPS) == 0) { error = cap_convert_nscap(idmap, dentry, &value, size); if (error < 0) return error; size = error; } retry_deleg: inode_lock(inode); error = __vfs_setxattr_locked(idmap, dentry, name, value, size, flags, &delegated_inode); inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } if (value != orig_value) kfree(value); return error; } EXPORT_SYMBOL_GPL(vfs_setxattr); static ssize_t xattr_getsecurity(struct mnt_idmap *idmap, struct inode *inode, const char *name, void *value, size_t size) { void *buffer = NULL; ssize_t len; if (!value || !size) { len = security_inode_getsecurity(idmap, inode, name, &buffer, false); goto out_noalloc; } len = security_inode_getsecurity(idmap, inode, name, &buffer, true); if (len < 0) return len; if (size < len) { len = -ERANGE; goto out; } memcpy(value, buffer, len); out: kfree(buffer); out_noalloc: return len; } /* * vfs_getxattr_alloc - allocate memory, if necessary, before calling getxattr * * Allocate memory, if not already allocated, or re-allocate correct size, * before retrieving the extended attribute. The xattr value buffer should * always be freed by the caller, even on error. * * Returns the result of alloc, if failed, or the getxattr operation. */ int vfs_getxattr_alloc(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, char **xattr_value, size_t xattr_size, gfp_t flags) { const struct xattr_handler *handler; struct inode *inode = dentry->d_inode; char *value = *xattr_value; int error; error = xattr_permission(idmap, inode, name, MAY_READ); if (error) return error; handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); if (!handler->get) return -EOPNOTSUPP; error = handler->get(handler, dentry, inode, name, NULL, 0); if (error < 0) return error; if (!value || (error > xattr_size)) { value = krealloc(*xattr_value, error + 1, flags); if (!value) return -ENOMEM; memset(value, 0, error + 1); } error = handler->get(handler, dentry, inode, name, value, error); *xattr_value = value; return error; } ssize_t __vfs_getxattr(struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size) { const struct xattr_handler *handler; if (is_posix_acl_xattr(name)) return -EOPNOTSUPP; handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); if (!handler->get) return -EOPNOTSUPP; return handler->get(handler, dentry, inode, name, value, size); } EXPORT_SYMBOL(__vfs_getxattr); ssize_t vfs_getxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, void *value, size_t size) { struct inode *inode = dentry->d_inode; int error; error = xattr_permission(idmap, inode, name, MAY_READ); if (error) return error; error = security_inode_getxattr(dentry, name); if (error) return error; if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; int ret = xattr_getsecurity(idmap, inode, suffix, value, size); /* * Only overwrite the return value if a security module * is actually active. */ if (ret == -EOPNOTSUPP) goto nolsm; return ret; } nolsm: return __vfs_getxattr(dentry, inode, name, value, size); } EXPORT_SYMBOL_GPL(vfs_getxattr); /** * vfs_listxattr - retrieve \0 separated list of xattr names * @dentry: the dentry from whose inode the xattr names are retrieved * @list: buffer to store xattr names into * @size: size of the buffer * * This function returns the names of all xattrs associated with the * inode of @dentry. * * Note, for legacy reasons the vfs_listxattr() function lists POSIX * ACLs as well. Since POSIX ACLs are decoupled from IOP_XATTR the * vfs_listxattr() function doesn't check for this flag since a * filesystem could implement POSIX ACLs without implementing any other * xattrs. * * However, since all codepaths that remove IOP_XATTR also assign of * inode operations that either don't implement or implement a stub * ->listxattr() operation. * * Return: On success, the size of the buffer that was used. On error a * negative error code. */ ssize_t vfs_listxattr(struct dentry *dentry, char *list, size_t size) { struct inode *inode = d_inode(dentry); ssize_t error; error = security_inode_listxattr(dentry); if (error) return error; if (inode->i_op->listxattr) { error = inode->i_op->listxattr(dentry, list, size); } else { error = security_inode_listsecurity(inode, list, size); if (size && error > size) error = -ERANGE; } return error; } EXPORT_SYMBOL_GPL(vfs_listxattr); int __vfs_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name) { struct inode *inode = d_inode(dentry); const struct xattr_handler *handler; if (is_posix_acl_xattr(name)) return -EOPNOTSUPP; handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); if (!handler->set) return -EOPNOTSUPP; return handler->set(handler, idmap, dentry, inode, name, NULL, 0, XATTR_REPLACE); } EXPORT_SYMBOL(__vfs_removexattr); /** * __vfs_removexattr_locked - set an extended attribute while holding the inode * lock * * @idmap: idmap of the mount of the target inode * @dentry: object to perform setxattr on * @name: name of xattr to remove * @delegated_inode: on return, will contain an inode pointer that * a delegation was broken on, NULL if none. */ int __vfs_removexattr_locked(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, struct inode **delegated_inode) { struct inode *inode = dentry->d_inode; int error; error = xattr_permission(idmap, inode, name, MAY_WRITE); if (error) return error; error = security_inode_removexattr(idmap, dentry, name); if (error) goto out; error = try_break_deleg(inode, delegated_inode); if (error) goto out; error = __vfs_removexattr(idmap, dentry, name); if (error) return error; fsnotify_xattr(dentry); security_inode_post_removexattr(dentry, name); out: return error; } EXPORT_SYMBOL_GPL(__vfs_removexattr_locked); int vfs_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name) { struct inode *inode = dentry->d_inode; struct inode *delegated_inode = NULL; int error; retry_deleg: inode_lock(inode); error = __vfs_removexattr_locked(idmap, dentry, name, &delegated_inode); inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } return error; } EXPORT_SYMBOL_GPL(vfs_removexattr); /* * Extended attribute SET operations */ int setxattr_copy(const char __user *name, struct xattr_ctx *ctx) { int error; if (ctx->flags & ~(XATTR_CREATE|XATTR_REPLACE)) return -EINVAL; error = strncpy_from_user(ctx->kname->name, name, sizeof(ctx->kname->name)); if (error == 0 || error == sizeof(ctx->kname->name)) return -ERANGE; if (error < 0) return error; error = 0; if (ctx->size) { if (ctx->size > XATTR_SIZE_MAX) return -E2BIG; ctx->kvalue = vmemdup_user(ctx->cvalue, ctx->size); if (IS_ERR(ctx->kvalue)) { error = PTR_ERR(ctx->kvalue); ctx->kvalue = NULL; } } return error; } int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, struct xattr_ctx *ctx) { if (is_posix_acl_xattr(ctx->kname->name)) return do_set_acl(idmap, dentry, ctx->kname->name, ctx->kvalue, ctx->size); return vfs_setxattr(idmap, dentry, ctx->kname->name, ctx->kvalue, ctx->size, ctx->flags); } static long setxattr(struct mnt_idmap *idmap, struct dentry *d, const char __user *name, const void __user *value, size_t size, int flags) { struct xattr_name kname; struct xattr_ctx ctx = { .cvalue = value, .kvalue = NULL, .size = size, .kname = &kname, .flags = flags, }; int error; error = setxattr_copy(name, &ctx); if (error) return error; error = do_setxattr(idmap, d, &ctx); kvfree(ctx.kvalue); return error; } static int path_setxattr(const char __user *pathname, const char __user *name, const void __user *value, size_t size, int flags, unsigned int lookup_flags) { struct path path; int error; retry: error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (error) return error; error = mnt_want_write(path.mnt); if (!error) { error = setxattr(mnt_idmap(path.mnt), path.dentry, name, value, size, flags); mnt_drop_write(path.mnt); } path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } return error; } SYSCALL_DEFINE5(setxattr, const char __user *, pathname, const char __user *, name, const void __user *, value, size_t, size, int, flags) { return path_setxattr(pathname, name, value, size, flags, LOOKUP_FOLLOW); } SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname, const char __user *, name, const void __user *, value, size_t, size, int, flags) { return path_setxattr(pathname, name, value, size, flags, 0); } SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, const void __user *,value, size_t, size, int, flags) { struct fd f = fdget(fd); int error = -EBADF; if (!f.file) return error; audit_file(f.file); error = mnt_want_write_file(f.file); if (!error) { error = setxattr(file_mnt_idmap(f.file), f.file->f_path.dentry, name, value, size, flags); mnt_drop_write_file(f.file); } fdput(f); return error; } /* * Extended attribute GET operations */ ssize_t do_getxattr(struct mnt_idmap *idmap, struct dentry *d, struct xattr_ctx *ctx) { ssize_t error; char *kname = ctx->kname->name; if (ctx->size) { if (ctx->size > XATTR_SIZE_MAX) ctx->size = XATTR_SIZE_MAX; ctx->kvalue = kvzalloc(ctx->size, GFP_KERNEL); if (!ctx->kvalue) return -ENOMEM; } if (is_posix_acl_xattr(ctx->kname->name)) error = do_get_acl(idmap, d, kname, ctx->kvalue, ctx->size); else error = vfs_getxattr(idmap, d, kname, ctx->kvalue, ctx->size); if (error > 0) { if (ctx->size && copy_to_user(ctx->value, ctx->kvalue, error)) error = -EFAULT; } else if (error == -ERANGE && ctx->size >= XATTR_SIZE_MAX) { /* The file system tried to returned a value bigger than XATTR_SIZE_MAX bytes. Not possible. */ error = -E2BIG; } return error; } static ssize_t getxattr(struct mnt_idmap *idmap, struct dentry *d, const char __user *name, void __user *value, size_t size) { ssize_t error; struct xattr_name kname; struct xattr_ctx ctx = { .value = value, .kvalue = NULL, .size = size, .kname = &kname, .flags = 0, }; error = strncpy_from_user(kname.name, name, sizeof(kname.name)); if (error == 0 || error == sizeof(kname.name)) error = -ERANGE; if (error < 0) return error; error = do_getxattr(idmap, d, &ctx); kvfree(ctx.kvalue); return error; } static ssize_t path_getxattr(const char __user *pathname, const char __user *name, void __user *value, size_t size, unsigned int lookup_flags) { struct path path; ssize_t error; retry: error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (error) return error; error = getxattr(mnt_idmap(path.mnt), path.dentry, name, value, size); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } return error; } SYSCALL_DEFINE4(getxattr, const char __user *, pathname, const char __user *, name, void __user *, value, size_t, size) { return path_getxattr(pathname, name, value, size, LOOKUP_FOLLOW); } SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname, const char __user *, name, void __user *, value, size_t, size) { return path_getxattr(pathname, name, value, size, 0); } SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name, void __user *, value, size_t, size) { struct fd f = fdget(fd); ssize_t error = -EBADF; if (!f.file) return error; audit_file(f.file); error = getxattr(file_mnt_idmap(f.file), f.file->f_path.dentry, name, value, size); fdput(f); return error; } /* * Extended attribute LIST operations */ static ssize_t listxattr(struct dentry *d, char __user *list, size_t size) { ssize_t error; char *klist = NULL; if (size) { if (size > XATTR_LIST_MAX) size = XATTR_LIST_MAX; klist = kvmalloc(size, GFP_KERNEL); if (!klist) return -ENOMEM; } error = vfs_listxattr(d, klist, size); if (error > 0) { if (size && copy_to_user(list, klist, error)) error = -EFAULT; } else if (error == -ERANGE && size >= XATTR_LIST_MAX) { /* The file system tried to returned a list bigger than XATTR_LIST_MAX bytes. Not possible. */ error = -E2BIG; } kvfree(klist); return error; } static ssize_t path_listxattr(const char __user *pathname, char __user *list, size_t size, unsigned int lookup_flags) { struct path path; ssize_t error; retry: error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (error) return error; error = listxattr(path.dentry, list, size); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } return error; } SYSCALL_DEFINE3(listxattr, const char __user *, pathname, char __user *, list, size_t, size) { return path_listxattr(pathname, list, size, LOOKUP_FOLLOW); } SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list, size_t, size) { return path_listxattr(pathname, list, size, 0); } SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size) { struct fd f = fdget(fd); ssize_t error = -EBADF; if (!f.file) return error; audit_file(f.file); error = listxattr(f.file->f_path.dentry, list, size); fdput(f); return error; } /* * Extended attribute REMOVE operations */ static long removexattr(struct mnt_idmap *idmap, struct dentry *d, const char __user *name) { int error; char kname[XATTR_NAME_MAX + 1]; error = strncpy_from_user(kname, name, sizeof(kname)); if (error == 0 || error == sizeof(kname)) error = -ERANGE; if (error < 0) return error; if (is_posix_acl_xattr(kname)) return vfs_remove_acl(idmap, d, kname); return vfs_removexattr(idmap, d, kname); } static int path_removexattr(const char __user *pathname, const char __user *name, unsigned int lookup_flags) { struct path path; int error; retry: error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (error) return error; error = mnt_want_write(path.mnt); if (!error) { error = removexattr(mnt_idmap(path.mnt), path.dentry, name); mnt_drop_write(path.mnt); } path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } return error; } SYSCALL_DEFINE2(removexattr, const char __user *, pathname, const char __user *, name) { return path_removexattr(pathname, name, LOOKUP_FOLLOW); } SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname, const char __user *, name) { return path_removexattr(pathname, name, 0); } SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) { struct fd f = fdget(fd); int error = -EBADF; if (!f.file) return error; audit_file(f.file); error = mnt_want_write_file(f.file); if (!error) { error = removexattr(file_mnt_idmap(f.file), f.file->f_path.dentry, name); mnt_drop_write_file(f.file); } fdput(f); return error; } int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name) { size_t len; len = strlen(name) + 1; if (*buffer) { if (*remaining_size < len) return -ERANGE; memcpy(*buffer, name, len); *buffer += len; } *remaining_size -= len; return 0; } /** * generic_listxattr - run through a dentry's xattr list() operations * @dentry: dentry to list the xattrs * @buffer: result buffer * @buffer_size: size of @buffer * * Combine the results of the list() operation from every xattr_handler in the * xattr_handler stack. * * Note that this will not include the entries for POSIX ACLs. */ ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { const struct xattr_handler *handler, * const *handlers = dentry->d_sb->s_xattr; ssize_t remaining_size = buffer_size; int err = 0; for_each_xattr_handler(handlers, handler) { if (!handler->name || (handler->list && !handler->list(dentry))) continue; err = xattr_list_one(&buffer, &remaining_size, handler->name); if (err) return err; } return err ? err : buffer_size - remaining_size; } EXPORT_SYMBOL(generic_listxattr); /** * xattr_full_name - Compute full attribute name from suffix * * @handler: handler of the xattr_handler operation * @name: name passed to the xattr_handler operation * * The get and set xattr handler operations are called with the remainder of * the attribute name after skipping the handler's prefix: for example, "foo" * is passed to the get operation of a handler with prefix "user." to get * attribute "user.foo". The full name is still "there" in the name though. * * Note: the list xattr handler operation when called from the vfs is passed a * NULL name; some file systems use this operation internally, with varying * semantics. */ const char *xattr_full_name(const struct xattr_handler *handler, const char *name) { size_t prefix_len = strlen(xattr_prefix(handler)); return name - prefix_len; } EXPORT_SYMBOL(xattr_full_name); /** * simple_xattr_space - estimate the memory used by a simple xattr * @name: the full name of the xattr * @size: the size of its value * * This takes no account of how much larger the two slab objects actually are: * that would depend on the slab implementation, when what is required is a * deterministic number, which grows with name length and size and quantity. * * Return: The approximate number of bytes of memory used by such an xattr. */ size_t simple_xattr_space(const char *name, size_t size) { /* * Use "40" instead of sizeof(struct simple_xattr), to return the * same result on 32-bit and 64-bit, and even if simple_xattr grows. */ return 40 + size + strlen(name); } /** * simple_xattr_free - free an xattr object * @xattr: the xattr object * * Free the xattr object. Can handle @xattr being NULL. */ void simple_xattr_free(struct simple_xattr *xattr) { if (xattr) kfree(xattr->name); kvfree(xattr); } /** * simple_xattr_alloc - allocate new xattr object * @value: value of the xattr object * @size: size of @value * * Allocate a new xattr object and initialize respective members. The caller is * responsible for handling the name of the xattr. * * Return: On success a new xattr object is returned. On failure NULL is * returned. */ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size) { struct simple_xattr *new_xattr; size_t len; /* wrap around? */ len = sizeof(*new_xattr) + size; if (len < sizeof(*new_xattr)) return NULL; new_xattr = kvmalloc(len, GFP_KERNEL_ACCOUNT); if (!new_xattr) return NULL; new_xattr->size = size; memcpy(new_xattr->value, value, size); return new_xattr; } /** * rbtree_simple_xattr_cmp - compare xattr name with current rbtree xattr entry * @key: xattr name * @node: current node * * Compare the xattr name with the xattr name attached to @node in the rbtree. * * Return: Negative value if continuing left, positive if continuing right, 0 * if the xattr attached to @node matches @key. */ static int rbtree_simple_xattr_cmp(const void *key, const struct rb_node *node) { const char *xattr_name = key; const struct simple_xattr *xattr; xattr = rb_entry(node, struct simple_xattr, rb_node); return strcmp(xattr->name, xattr_name); } /** * rbtree_simple_xattr_node_cmp - compare two xattr rbtree nodes * @new_node: new node * @node: current node * * Compare the xattr attached to @new_node with the xattr attached to @node. * * Return: Negative value if continuing left, positive if continuing right, 0 * if the xattr attached to @new_node matches the xattr attached to @node. */ static int rbtree_simple_xattr_node_cmp(struct rb_node *new_node, const struct rb_node *node) { struct simple_xattr *xattr; xattr = rb_entry(new_node, struct simple_xattr, rb_node); return rbtree_simple_xattr_cmp(xattr->name, node); } /** * simple_xattr_get - get an xattr object * @xattrs: the header of the xattr object * @name: the name of the xattr to retrieve * @buffer: the buffer to store the value into * @size: the size of @buffer * * Try to find and retrieve the xattr object associated with @name. * If @buffer is provided store the value of @xattr in @buffer * otherwise just return the length. The size of @buffer is limited * to XATTR_SIZE_MAX which currently is 65536. * * Return: On success the length of the xattr value is returned. On error a * negative error code is returned. */ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, void *buffer, size_t size) { struct simple_xattr *xattr = NULL; struct rb_node *rbp; int ret = -ENODATA; read_lock(&xattrs->lock); rbp = rb_find(name, &xattrs->rb_root, rbtree_simple_xattr_cmp); if (rbp) { xattr = rb_entry(rbp, struct simple_xattr, rb_node); ret = xattr->size; if (buffer) { if (size < xattr->size) ret = -ERANGE; else memcpy(buffer, xattr->value, xattr->size); } } read_unlock(&xattrs->lock); return ret; } /** * simple_xattr_set - set an xattr object * @xattrs: the header of the xattr object * @name: the name of the xattr to retrieve * @value: the value to store along the xattr * @size: the size of @value * @flags: the flags determining how to set the xattr * * Set a new xattr object. * If @value is passed a new xattr object will be allocated. If XATTR_REPLACE * is specified in @flags a matching xattr object for @name must already exist. * If it does it will be replaced with the new xattr object. If it doesn't we * fail. If XATTR_CREATE is specified and a matching xattr does already exist * we fail. If it doesn't we create a new xattr. If @flags is zero we simply * insert the new xattr replacing any existing one. * * If @value is empty and a matching xattr object is found we delete it if * XATTR_REPLACE is specified in @flags or @flags is zero. * * If @value is empty and no matching xattr object for @name is found we do * nothing if XATTR_CREATE is specified in @flags or @flags is zero. For * XATTR_REPLACE we fail as mentioned above. * * Return: On success, the removed or replaced xattr is returned, to be freed * by the caller; or NULL if none. On failure a negative error code is returned. */ struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs, const char *name, const void *value, size_t size, int flags) { struct simple_xattr *old_xattr = NULL, *new_xattr = NULL; struct rb_node *parent = NULL, **rbp; int err = 0, ret; /* value == NULL means remove */ if (value) { new_xattr = simple_xattr_alloc(value, size); if (!new_xattr) return ERR_PTR(-ENOMEM); new_xattr->name = kstrdup(name, GFP_KERNEL_ACCOUNT); if (!new_xattr->name) { simple_xattr_free(new_xattr); return ERR_PTR(-ENOMEM); } } write_lock(&xattrs->lock); rbp = &xattrs->rb_root.rb_node; while (*rbp) { parent = *rbp; ret = rbtree_simple_xattr_cmp(name, *rbp); if (ret < 0) rbp = &(*rbp)->rb_left; else if (ret > 0) rbp = &(*rbp)->rb_right; else old_xattr = rb_entry(*rbp, struct simple_xattr, rb_node); if (old_xattr) break; } if (old_xattr) { /* Fail if XATTR_CREATE is requested and the xattr exists. */ if (flags & XATTR_CREATE) { err = -EEXIST; goto out_unlock; } if (new_xattr) rb_replace_node(&old_xattr->rb_node, &new_xattr->rb_node, &xattrs->rb_root); else rb_erase(&old_xattr->rb_node, &xattrs->rb_root); } else { /* Fail if XATTR_REPLACE is requested but no xattr is found. */ if (flags & XATTR_REPLACE) { err = -ENODATA; goto out_unlock; } /* * If XATTR_CREATE or no flags are specified together with a * new value simply insert it. */ if (new_xattr) { rb_link_node(&new_xattr->rb_node, parent, rbp); rb_insert_color(&new_xattr->rb_node, &xattrs->rb_root); } /* * If XATTR_CREATE or no flags are specified and neither an * old or new xattr exist then we don't need to do anything. */ } out_unlock: write_unlock(&xattrs->lock); if (!err) return old_xattr; simple_xattr_free(new_xattr); return ERR_PTR(err); } static bool xattr_is_trusted(const char *name) { return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); } /** * simple_xattr_list - list all xattr objects * @inode: inode from which to get the xattrs * @xattrs: the header of the xattr object * @buffer: the buffer to store all xattrs into * @size: the size of @buffer * * List all xattrs associated with @inode. If @buffer is NULL we returned * the required size of the buffer. If @buffer is provided we store the * xattrs value into it provided it is big enough. * * Note, the number of xattr names that can be listed with listxattr(2) is * limited to XATTR_LIST_MAX aka 65536 bytes. If a larger buffer is passed * then vfs_listxattr() caps it to XATTR_LIST_MAX and if more xattr names * are found it will return -E2BIG. * * Return: On success the required size or the size of the copied xattrs is * returned. On error a negative error code is returned. */ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer, size_t size) { bool trusted = ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN); struct simple_xattr *xattr; struct rb_node *rbp; ssize_t remaining_size = size; int err = 0; err = posix_acl_listxattr(inode, &buffer, &remaining_size); if (err) return err; read_lock(&xattrs->lock); for (rbp = rb_first(&xattrs->rb_root); rbp; rbp = rb_next(rbp)) { xattr = rb_entry(rbp, struct simple_xattr, rb_node); /* skip "trusted." attributes for unprivileged callers */ if (!trusted && xattr_is_trusted(xattr->name)) continue; err = xattr_list_one(&buffer, &remaining_size, xattr->name); if (err) break; } read_unlock(&xattrs->lock); return err ? err : size - remaining_size; } /** * rbtree_simple_xattr_less - compare two xattr rbtree nodes * @new_node: new node * @node: current node * * Compare the xattr attached to @new_node with the xattr attached to @node. * Note that this function technically tolerates duplicate entries. * * Return: True if insertion point in the rbtree is found. */ static bool rbtree_simple_xattr_less(struct rb_node *new_node, const struct rb_node *node) { return rbtree_simple_xattr_node_cmp(new_node, node) < 0; } /** * simple_xattr_add - add xattr objects * @xattrs: the header of the xattr object * @new_xattr: the xattr object to add * * Add an xattr object to @xattrs. This assumes no replacement or removal * of matching xattrs is wanted. Should only be called during inode * initialization when a few distinct initial xattrs are supposed to be set. */ void simple_xattr_add(struct simple_xattrs *xattrs, struct simple_xattr *new_xattr) { write_lock(&xattrs->lock); rb_add(&new_xattr->rb_node, &xattrs->rb_root, rbtree_simple_xattr_less); write_unlock(&xattrs->lock); } /** * simple_xattrs_init - initialize new xattr header * @xattrs: header to initialize * * Initialize relevant fields of a an xattr header. */ void simple_xattrs_init(struct simple_xattrs *xattrs) { xattrs->rb_root = RB_ROOT; rwlock_init(&xattrs->lock); } /** * simple_xattrs_free - free xattrs * @xattrs: xattr header whose xattrs to destroy * @freed_space: approximate number of bytes of memory freed from @xattrs * * Destroy all xattrs in @xattr. When this is called no one can hold a * reference to any of the xattrs anymore. */ void simple_xattrs_free(struct simple_xattrs *xattrs, size_t *freed_space) { struct rb_node *rbp; if (freed_space) *freed_space = 0; rbp = rb_first(&xattrs->rb_root); while (rbp) { struct simple_xattr *xattr; struct rb_node *rbp_next; rbp_next = rb_next(rbp); xattr = rb_entry(rbp, struct simple_xattr, rb_node); rb_erase(&xattr->rb_node, &xattrs->rb_root); if (freed_space) *freed_space += simple_xattr_space(xattr->name, xattr->size); simple_xattr_free(xattr); rbp = rbp_next; } }
1 1 1 1 1 1 1 2 2 2 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2013 Eric Leblond <eric@regit.org> * * Development of this code partly funded by OISF * (http://www.openinfosecfoundation.org/) */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/jhash.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_queue.h> static u32 jhash_initval __read_mostly; struct nft_queue { u8 sreg_qnum; u16 queuenum; u16 queues_total; u16 flags; }; static void nft_queue_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_queue *priv = nft_expr_priv(expr); u32 queue = priv->queuenum; u32 ret; if (priv->queues_total > 1) { if (priv->flags & NFT_QUEUE_FLAG_CPU_FANOUT) { int cpu = raw_smp_processor_id(); queue = priv->queuenum + cpu % priv->queues_total; } else { queue = nfqueue_hash(pkt->skb, queue, priv->queues_total, nft_pf(pkt), jhash_initval); } } ret = NF_QUEUE_NR(queue); if (priv->flags & NFT_QUEUE_FLAG_BYPASS) ret |= NF_VERDICT_FLAG_QUEUE_BYPASS; regs->verdict.code = ret; } static void nft_queue_sreg_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_queue *priv = nft_expr_priv(expr); u32 queue, ret; queue = regs->data[priv->sreg_qnum]; ret = NF_QUEUE_NR(queue); if (priv->flags & NFT_QUEUE_FLAG_BYPASS) ret |= NF_VERDICT_FLAG_QUEUE_BYPASS; regs->verdict.code = ret; } static int nft_queue_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data) { static const unsigned int supported_hooks = ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_POST_ROUTING)); switch (ctx->family) { case NFPROTO_IPV4: case NFPROTO_IPV6: case NFPROTO_INET: case NFPROTO_BRIDGE: break; case NFPROTO_NETDEV: /* lacks okfn */ fallthrough; default: return -EOPNOTSUPP; } return nft_chain_validate_hooks(ctx->chain, supported_hooks); } static const struct nla_policy nft_queue_policy[NFTA_QUEUE_MAX + 1] = { [NFTA_QUEUE_NUM] = { .type = NLA_U16 }, [NFTA_QUEUE_TOTAL] = { .type = NLA_U16 }, [NFTA_QUEUE_FLAGS] = { .type = NLA_U16 }, [NFTA_QUEUE_SREG_QNUM] = { .type = NLA_U32 }, }; static int nft_queue_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_queue *priv = nft_expr_priv(expr); u32 maxid; priv->queuenum = ntohs(nla_get_be16(tb[NFTA_QUEUE_NUM])); if (tb[NFTA_QUEUE_TOTAL]) priv->queues_total = ntohs(nla_get_be16(tb[NFTA_QUEUE_TOTAL])); else priv->queues_total = 1; if (priv->queues_total == 0) return -EINVAL; maxid = priv->queues_total - 1 + priv->queuenum; if (maxid > U16_MAX) return -ERANGE; if (tb[NFTA_QUEUE_FLAGS]) { priv->flags = ntohs(nla_get_be16(tb[NFTA_QUEUE_FLAGS])); if (priv->flags & ~NFT_QUEUE_FLAG_MASK) return -EINVAL; } return 0; } static int nft_queue_sreg_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_queue *priv = nft_expr_priv(expr); int err; err = nft_parse_register_load(tb[NFTA_QUEUE_SREG_QNUM], &priv->sreg_qnum, sizeof(u32)); if (err < 0) return err; if (tb[NFTA_QUEUE_FLAGS]) { priv->flags = ntohs(nla_get_be16(tb[NFTA_QUEUE_FLAGS])); if (priv->flags & ~NFT_QUEUE_FLAG_MASK) return -EINVAL; if (priv->flags & NFT_QUEUE_FLAG_CPU_FANOUT) return -EOPNOTSUPP; } return 0; } static int nft_queue_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_queue *priv = nft_expr_priv(expr); if (nla_put_be16(skb, NFTA_QUEUE_NUM, htons(priv->queuenum)) || nla_put_be16(skb, NFTA_QUEUE_TOTAL, htons(priv->queues_total)) || nla_put_be16(skb, NFTA_QUEUE_FLAGS, htons(priv->flags))) goto nla_put_failure; return 0; nla_put_failure: return -1; } static int nft_queue_sreg_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_queue *priv = nft_expr_priv(expr); if (nft_dump_register(skb, NFTA_QUEUE_SREG_QNUM, priv->sreg_qnum) || nla_put_be16(skb, NFTA_QUEUE_FLAGS, htons(priv->flags))) goto nla_put_failure; return 0; nla_put_failure: return -1; } static struct nft_expr_type nft_queue_type; static const struct nft_expr_ops nft_queue_ops = { .type = &nft_queue_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_queue)), .eval = nft_queue_eval, .init = nft_queue_init, .dump = nft_queue_dump, .validate = nft_queue_validate, .reduce = NFT_REDUCE_READONLY, }; static const struct nft_expr_ops nft_queue_sreg_ops = { .type = &nft_queue_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_queue)), .eval = nft_queue_sreg_eval, .init = nft_queue_sreg_init, .dump = nft_queue_sreg_dump, .validate = nft_queue_validate, .reduce = NFT_REDUCE_READONLY, }; static const struct nft_expr_ops * nft_queue_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { if (tb[NFTA_QUEUE_NUM] && tb[NFTA_QUEUE_SREG_QNUM]) return ERR_PTR(-EINVAL); init_hashrandom(&jhash_initval); if (tb[NFTA_QUEUE_NUM]) return &nft_queue_ops; if (tb[NFTA_QUEUE_SREG_QNUM]) return &nft_queue_sreg_ops; return ERR_PTR(-EINVAL); } static struct nft_expr_type nft_queue_type __read_mostly = { .name = "queue", .select_ops = nft_queue_select_ops, .policy = nft_queue_policy, .maxattr = NFTA_QUEUE_MAX, .owner = THIS_MODULE, }; static int __init nft_queue_module_init(void) { return nft_register_expr(&nft_queue_type); } static void __exit nft_queue_module_exit(void) { nft_unregister_expr(&nft_queue_type); } module_init(nft_queue_module_init); module_exit(nft_queue_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Eric Leblond <eric@regit.org>"); MODULE_ALIAS_NFT_EXPR("queue"); MODULE_DESCRIPTION("Netfilter nftables queue module");
3 3 2 2 2 2 1 4 3 4 3 1 4 3 1 3 2 4 4 4 4 2 4 2 4 2 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 // SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> #include <linux/err.h> #include <linux/scatterlist.h> #include <linux/sched.h> #include <linux/slab.h> #include <crypto/aes.h> #include <crypto/skcipher.h> #include <linux/key-type.h> #include <linux/sched/mm.h> #include <keys/ceph-type.h> #include <keys/user-type.h> #include <linux/ceph/decode.h> #include "crypto.h" /* * Set ->key and ->tfm. The rest of the key should be filled in before * this function is called. */ static int set_secret(struct ceph_crypto_key *key, void *buf) { unsigned int noio_flag; int ret; key->key = NULL; key->tfm = NULL; switch (key->type) { case CEPH_CRYPTO_NONE: return 0; /* nothing to do */ case CEPH_CRYPTO_AES: break; default: return -ENOTSUPP; } if (!key->len) return -EINVAL; key->key = kmemdup(buf, key->len, GFP_NOIO); if (!key->key) { ret = -ENOMEM; goto fail; } /* crypto_alloc_sync_skcipher() allocates with GFP_KERNEL */ noio_flag = memalloc_noio_save(); key->tfm = crypto_alloc_sync_skcipher("cbc(aes)", 0, 0); memalloc_noio_restore(noio_flag); if (IS_ERR(key->tfm)) { ret = PTR_ERR(key->tfm); key->tfm = NULL; goto fail; } ret = crypto_sync_skcipher_setkey(key->tfm, key->key, key->len); if (ret) goto fail; return 0; fail: ceph_crypto_key_destroy(key); return ret; } int ceph_crypto_key_clone(struct ceph_crypto_key *dst, const struct ceph_crypto_key *src) { memcpy(dst, src, sizeof(struct ceph_crypto_key)); return set_secret(dst, src->key); } int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end) { if (*p + sizeof(u16) + sizeof(key->created) + sizeof(u16) + key->len > end) return -ERANGE; ceph_encode_16(p, key->type); ceph_encode_copy(p, &key->created, sizeof(key->created)); ceph_encode_16(p, key->len); ceph_encode_copy(p, key->key, key->len); return 0; } int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end) { int ret; ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad); key->type = ceph_decode_16(p); ceph_decode_copy(p, &key->created, sizeof(key->created)); key->len = ceph_decode_16(p); ceph_decode_need(p, end, key->len, bad); ret = set_secret(key, *p); memzero_explicit(*p, key->len); *p += key->len; return ret; bad: dout("failed to decode crypto key\n"); return -EINVAL; } int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey) { int inlen = strlen(inkey); int blen = inlen * 3 / 4; void *buf, *p; int ret; dout("crypto_key_unarmor %s\n", inkey); buf = kmalloc(blen, GFP_NOFS); if (!buf) return -ENOMEM; blen = ceph_unarmor(buf, inkey, inkey+inlen); if (blen < 0) { kfree(buf); return blen; } p = buf; ret = ceph_crypto_key_decode(key, &p, p + blen); kfree(buf); if (ret) return ret; dout("crypto_key_unarmor key %p type %d len %d\n", key, key->type, key->len); return 0; } void ceph_crypto_key_destroy(struct ceph_crypto_key *key) { if (key) { kfree_sensitive(key->key); key->key = NULL; if (key->tfm) { crypto_free_sync_skcipher(key->tfm); key->tfm = NULL; } } } static const u8 *aes_iv = (u8 *)CEPH_AES_IV; /* * Should be used for buffers allocated with kvmalloc(). * Currently these are encrypt out-buffer (ceph_buffer) and decrypt * in-buffer (msg front). * * Dispose of @sgt with teardown_sgtable(). * * @prealloc_sg is to avoid memory allocation inside sg_alloc_table() * in cases where a single sg is sufficient. No attempt to reduce the * number of sgs by squeezing physically contiguous pages together is * made though, for simplicity. */ static int setup_sgtable(struct sg_table *sgt, struct scatterlist *prealloc_sg, const void *buf, unsigned int buf_len) { struct scatterlist *sg; const bool is_vmalloc = is_vmalloc_addr(buf); unsigned int off = offset_in_page(buf); unsigned int chunk_cnt = 1; unsigned int chunk_len = PAGE_ALIGN(off + buf_len); int i; int ret; if (buf_len == 0) { memset(sgt, 0, sizeof(*sgt)); return -EINVAL; } if (is_vmalloc) { chunk_cnt = chunk_len >> PAGE_SHIFT; chunk_len = PAGE_SIZE; } if (chunk_cnt > 1) { ret = sg_alloc_table(sgt, chunk_cnt, GFP_NOFS); if (ret) return ret; } else { WARN_ON(chunk_cnt != 1); sg_init_table(prealloc_sg, 1); sgt->sgl = prealloc_sg; sgt->nents = sgt->orig_nents = 1; } for_each_sg(sgt->sgl, sg, sgt->orig_nents, i) { struct page *page; unsigned int len = min(chunk_len - off, buf_len); if (is_vmalloc) page = vmalloc_to_page(buf); else page = virt_to_page(buf); sg_set_page(sg, page, len, off); off = 0; buf += len; buf_len -= len; } WARN_ON(buf_len != 0); return 0; } static void teardown_sgtable(struct sg_table *sgt) { if (sgt->orig_nents > 1) sg_free_table(sgt); } static int ceph_aes_crypt(const struct ceph_crypto_key *key, bool encrypt, void *buf, int buf_len, int in_len, int *pout_len) { SYNC_SKCIPHER_REQUEST_ON_STACK(req, key->tfm); struct sg_table sgt; struct scatterlist prealloc_sg; char iv[AES_BLOCK_SIZE] __aligned(8); int pad_byte = AES_BLOCK_SIZE - (in_len & (AES_BLOCK_SIZE - 1)); int crypt_len = encrypt ? in_len + pad_byte : in_len; int ret; WARN_ON(crypt_len > buf_len); if (encrypt) memset(buf + in_len, pad_byte, pad_byte); ret = setup_sgtable(&sgt, &prealloc_sg, buf, crypt_len); if (ret) return ret; memcpy(iv, aes_iv, AES_BLOCK_SIZE); skcipher_request_set_sync_tfm(req, key->tfm); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sgt.sgl, sgt.sgl, crypt_len, iv); /* print_hex_dump(KERN_ERR, "key: ", DUMP_PREFIX_NONE, 16, 1, key->key, key->len, 1); print_hex_dump(KERN_ERR, " in: ", DUMP_PREFIX_NONE, 16, 1, buf, crypt_len, 1); */ if (encrypt) ret = crypto_skcipher_encrypt(req); else ret = crypto_skcipher_decrypt(req); skcipher_request_zero(req); if (ret) { pr_err("%s %scrypt failed: %d\n", __func__, encrypt ? "en" : "de", ret); goto out_sgt; } /* print_hex_dump(KERN_ERR, "out: ", DUMP_PREFIX_NONE, 16, 1, buf, crypt_len, 1); */ if (encrypt) { *pout_len = crypt_len; } else { pad_byte = *(char *)(buf + in_len - 1); if (pad_byte > 0 && pad_byte <= AES_BLOCK_SIZE && in_len >= pad_byte) { *pout_len = in_len - pad_byte; } else { pr_err("%s got bad padding %d on in_len %d\n", __func__, pad_byte, in_len); ret = -EPERM; goto out_sgt; } } out_sgt: teardown_sgtable(&sgt); return ret; } int ceph_crypt(const struct ceph_crypto_key *key, bool encrypt, void *buf, int buf_len, int in_len, int *pout_len) { switch (key->type) { case CEPH_CRYPTO_NONE: *pout_len = in_len; return 0; case CEPH_CRYPTO_AES: return ceph_aes_crypt(key, encrypt, buf, buf_len, in_len, pout_len); default: return -ENOTSUPP; } } static int ceph_key_preparse(struct key_preparsed_payload *prep) { struct ceph_crypto_key *ckey; size_t datalen = prep->datalen; int ret; void *p; ret = -EINVAL; if (datalen <= 0 || datalen > 32767 || !prep->data) goto err; ret = -ENOMEM; ckey = kmalloc(sizeof(*ckey), GFP_KERNEL); if (!ckey) goto err; /* TODO ceph_crypto_key_decode should really take const input */ p = (void *)prep->data; ret = ceph_crypto_key_decode(ckey, &p, (char*)prep->data+datalen); if (ret < 0) goto err_ckey; prep->payload.data[0] = ckey; prep->quotalen = datalen; return 0; err_ckey: kfree(ckey); err: return ret; } static void ceph_key_free_preparse(struct key_preparsed_payload *prep) { struct ceph_crypto_key *ckey = prep->payload.data[0]; ceph_crypto_key_destroy(ckey); kfree(ckey); } static void ceph_key_destroy(struct key *key) { struct ceph_crypto_key *ckey = key->payload.data[0]; ceph_crypto_key_destroy(ckey); kfree(ckey); } struct key_type key_type_ceph = { .name = "ceph", .preparse = ceph_key_preparse, .free_preparse = ceph_key_free_preparse, .instantiate = generic_key_instantiate, .destroy = ceph_key_destroy, }; int __init ceph_crypto_init(void) { return register_key_type(&key_type_ceph); } void ceph_crypto_shutdown(void) { unregister_key_type(&key_type_ceph); }
19 19 19 5 5 19 1 19 18 2 2 2 2 19 19 19 15 4 19 4 15 19 1 13 13 13 7 1 8 8 7 7 7 5 5 5 5 5 5 9 7 3 9 1 9 2 2 2 9 9 4 4 48 17 6 35 16 12 12 34 20 2 12 13 8 10 1 47 47 8 8 8 45 42 60 60 7 7 7 5 5 5 13 13 13 8 9 8 6 2 8 8 2 1 8 8 8 4 2 6 46 41 41 42 42 42 40 55 56 53 4 4 53 54 54 60 60 3 3 20 9 9 7 7 2 1 2 1 1 1 1 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 // SPDX-License-Identifier: GPL-2.0-or-later /* Manage a process's keyrings * * Copyright (C) 2004-2005, 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/init.h> #include <linux/sched.h> #include <linux/sched/user.h> #include <linux/keyctl.h> #include <linux/fs.h> #include <linux/err.h> #include <linux/mutex.h> #include <linux/security.h> #include <linux/user_namespace.h> #include <linux/uaccess.h> #include <linux/init_task.h> #include <keys/request_key_auth-type.h> #include "internal.h" /* Session keyring create vs join semaphore */ static DEFINE_MUTEX(key_session_mutex); /* The root user's tracking struct */ struct key_user root_key_user = { .usage = REFCOUNT_INIT(3), .cons_lock = __MUTEX_INITIALIZER(root_key_user.cons_lock), .lock = __SPIN_LOCK_UNLOCKED(root_key_user.lock), .nkeys = ATOMIC_INIT(2), .nikeys = ATOMIC_INIT(2), .uid = GLOBAL_ROOT_UID, }; /* * Get or create a user register keyring. */ static struct key *get_user_register(struct user_namespace *user_ns) { struct key *reg_keyring = READ_ONCE(user_ns->user_keyring_register); if (reg_keyring) return reg_keyring; down_write(&user_ns->keyring_sem); /* Make sure there's a register keyring. It gets owned by the * user_namespace's owner. */ reg_keyring = user_ns->user_keyring_register; if (!reg_keyring) { reg_keyring = keyring_alloc(".user_reg", user_ns->owner, INVALID_GID, &init_cred, KEY_POS_WRITE | KEY_POS_SEARCH | KEY_USR_VIEW | KEY_USR_READ, 0, NULL, NULL); if (!IS_ERR(reg_keyring)) smp_store_release(&user_ns->user_keyring_register, reg_keyring); } up_write(&user_ns->keyring_sem); /* We don't return a ref since the keyring is pinned by the user_ns */ return reg_keyring; } /* * Look up the user and user session keyrings for the current process's UID, * creating them if they don't exist. */ int look_up_user_keyrings(struct key **_user_keyring, struct key **_user_session_keyring) { const struct cred *cred = current_cred(); struct user_namespace *user_ns = current_user_ns(); struct key *reg_keyring, *uid_keyring, *session_keyring; key_perm_t user_keyring_perm; key_ref_t uid_keyring_r, session_keyring_r; uid_t uid = from_kuid(user_ns, cred->user->uid); char buf[20]; int ret; user_keyring_perm = (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_ALL; kenter("%u", uid); reg_keyring = get_user_register(user_ns); if (IS_ERR(reg_keyring)) return PTR_ERR(reg_keyring); down_write(&user_ns->keyring_sem); ret = 0; /* Get the user keyring. Note that there may be one in existence * already as it may have been pinned by a session, but the user_struct * pointing to it may have been destroyed by setuid. */ snprintf(buf, sizeof(buf), "_uid.%u", uid); uid_keyring_r = keyring_search(make_key_ref(reg_keyring, true), &key_type_keyring, buf, false); kdebug("_uid %p", uid_keyring_r); if (uid_keyring_r == ERR_PTR(-EAGAIN)) { uid_keyring = keyring_alloc(buf, cred->user->uid, INVALID_GID, cred, user_keyring_perm, KEY_ALLOC_UID_KEYRING | KEY_ALLOC_IN_QUOTA, NULL, reg_keyring); if (IS_ERR(uid_keyring)) { ret = PTR_ERR(uid_keyring); goto error; } } else if (IS_ERR(uid_keyring_r)) { ret = PTR_ERR(uid_keyring_r); goto error; } else { uid_keyring = key_ref_to_ptr(uid_keyring_r); } /* Get a default session keyring (which might also exist already) */ snprintf(buf, sizeof(buf), "_uid_ses.%u", uid); session_keyring_r = keyring_search(make_key_ref(reg_keyring, true), &key_type_keyring, buf, false); kdebug("_uid_ses %p", session_keyring_r); if (session_keyring_r == ERR_PTR(-EAGAIN)) { session_keyring = keyring_alloc(buf, cred->user->uid, INVALID_GID, cred, user_keyring_perm, KEY_ALLOC_UID_KEYRING | KEY_ALLOC_IN_QUOTA, NULL, NULL); if (IS_ERR(session_keyring)) { ret = PTR_ERR(session_keyring); goto error_release; } /* We install a link from the user session keyring to * the user keyring. */ ret = key_link(session_keyring, uid_keyring); if (ret < 0) goto error_release_session; /* And only then link the user-session keyring to the * register. */ ret = key_link(reg_keyring, session_keyring); if (ret < 0) goto error_release_session; } else if (IS_ERR(session_keyring_r)) { ret = PTR_ERR(session_keyring_r); goto error_release; } else { session_keyring = key_ref_to_ptr(session_keyring_r); } up_write(&user_ns->keyring_sem); if (_user_session_keyring) *_user_session_keyring = session_keyring; else key_put(session_keyring); if (_user_keyring) *_user_keyring = uid_keyring; else key_put(uid_keyring); kleave(" = 0"); return 0; error_release_session: key_put(session_keyring); error_release: key_put(uid_keyring); error: up_write(&user_ns->keyring_sem); kleave(" = %d", ret); return ret; } /* * Get the user session keyring if it exists, but don't create it if it * doesn't. */ struct key *get_user_session_keyring_rcu(const struct cred *cred) { struct key *reg_keyring = READ_ONCE(cred->user_ns->user_keyring_register); key_ref_t session_keyring_r; char buf[20]; struct keyring_search_context ctx = { .index_key.type = &key_type_keyring, .index_key.description = buf, .cred = cred, .match_data.cmp = key_default_cmp, .match_data.raw_data = buf, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, .flags = KEYRING_SEARCH_DO_STATE_CHECK, }; if (!reg_keyring) return NULL; ctx.index_key.desc_len = snprintf(buf, sizeof(buf), "_uid_ses.%u", from_kuid(cred->user_ns, cred->user->uid)); session_keyring_r = keyring_search_rcu(make_key_ref(reg_keyring, true), &ctx); if (IS_ERR(session_keyring_r)) return NULL; return key_ref_to_ptr(session_keyring_r); } /* * Install a thread keyring to the given credentials struct if it didn't have * one already. This is allowed to overrun the quota. * * Return: 0 if a thread keyring is now present; -errno on failure. */ int install_thread_keyring_to_cred(struct cred *new) { struct key *keyring; if (new->thread_keyring) return 0; keyring = keyring_alloc("_tid", new->uid, new->gid, new, KEY_POS_ALL | KEY_USR_VIEW, KEY_ALLOC_QUOTA_OVERRUN, NULL, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); new->thread_keyring = keyring; return 0; } /* * Install a thread keyring to the current task if it didn't have one already. * * Return: 0 if a thread keyring is now present; -errno on failure. */ static int install_thread_keyring(void) { struct cred *new; int ret; new = prepare_creds(); if (!new) return -ENOMEM; ret = install_thread_keyring_to_cred(new); if (ret < 0) { abort_creds(new); return ret; } return commit_creds(new); } /* * Install a process keyring to the given credentials struct if it didn't have * one already. This is allowed to overrun the quota. * * Return: 0 if a process keyring is now present; -errno on failure. */ int install_process_keyring_to_cred(struct cred *new) { struct key *keyring; if (new->process_keyring) return 0; keyring = keyring_alloc("_pid", new->uid, new->gid, new, KEY_POS_ALL | KEY_USR_VIEW, KEY_ALLOC_QUOTA_OVERRUN, NULL, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); new->process_keyring = keyring; return 0; } /* * Install a process keyring to the current task if it didn't have one already. * * Return: 0 if a process keyring is now present; -errno on failure. */ static int install_process_keyring(void) { struct cred *new; int ret; new = prepare_creds(); if (!new) return -ENOMEM; ret = install_process_keyring_to_cred(new); if (ret < 0) { abort_creds(new); return ret; } return commit_creds(new); } /* * Install the given keyring as the session keyring of the given credentials * struct, replacing the existing one if any. If the given keyring is NULL, * then install a new anonymous session keyring. * @cred can not be in use by any task yet. * * Return: 0 on success; -errno on failure. */ int install_session_keyring_to_cred(struct cred *cred, struct key *keyring) { unsigned long flags; struct key *old; might_sleep(); /* create an empty session keyring */ if (!keyring) { flags = KEY_ALLOC_QUOTA_OVERRUN; if (cred->session_keyring) flags = KEY_ALLOC_IN_QUOTA; keyring = keyring_alloc("_ses", cred->uid, cred->gid, cred, KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ, flags, NULL, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); } else { __key_get(keyring); } /* install the keyring */ old = cred->session_keyring; cred->session_keyring = keyring; if (old) key_put(old); return 0; } /* * Install the given keyring as the session keyring of the current task, * replacing the existing one if any. If the given keyring is NULL, then * install a new anonymous session keyring. * * Return: 0 on success; -errno on failure. */ static int install_session_keyring(struct key *keyring) { struct cred *new; int ret; new = prepare_creds(); if (!new) return -ENOMEM; ret = install_session_keyring_to_cred(new, keyring); if (ret < 0) { abort_creds(new); return ret; } return commit_creds(new); } /* * Handle the fsuid changing. */ void key_fsuid_changed(struct cred *new_cred) { /* update the ownership of the thread keyring */ if (new_cred->thread_keyring) { down_write(&new_cred->thread_keyring->sem); new_cred->thread_keyring->uid = new_cred->fsuid; up_write(&new_cred->thread_keyring->sem); } } /* * Handle the fsgid changing. */ void key_fsgid_changed(struct cred *new_cred) { /* update the ownership of the thread keyring */ if (new_cred->thread_keyring) { down_write(&new_cred->thread_keyring->sem); new_cred->thread_keyring->gid = new_cred->fsgid; up_write(&new_cred->thread_keyring->sem); } } /* * Search the process keyrings attached to the supplied cred for the first * matching key under RCU conditions (the caller must be holding the RCU read * lock). * * The search criteria are the type and the match function. The description is * given to the match function as a parameter, but doesn't otherwise influence * the search. Typically the match function will compare the description * parameter to the key's description. * * This can only search keyrings that grant Search permission to the supplied * credentials. Keyrings linked to searched keyrings will also be searched if * they grant Search permission too. Keys can only be found if they grant * Search permission to the credentials. * * Returns a pointer to the key with the key usage count incremented if * successful, -EAGAIN if we didn't find any matching key or -ENOKEY if we only * matched negative keys. * * In the case of a successful return, the possession attribute is set on the * returned key reference. */ key_ref_t search_cred_keyrings_rcu(struct keyring_search_context *ctx) { struct key *user_session; key_ref_t key_ref, ret, err; const struct cred *cred = ctx->cred; /* we want to return -EAGAIN or -ENOKEY if any of the keyrings were * searchable, but we failed to find a key or we found a negative key; * otherwise we want to return a sample error (probably -EACCES) if * none of the keyrings were searchable * * in terms of priority: success > -ENOKEY > -EAGAIN > other error */ key_ref = NULL; ret = NULL; err = ERR_PTR(-EAGAIN); /* search the thread keyring first */ if (cred->thread_keyring) { key_ref = keyring_search_rcu( make_key_ref(cred->thread_keyring, 1), ctx); if (!IS_ERR(key_ref)) goto found; switch (PTR_ERR(key_ref)) { case -EAGAIN: /* no key */ case -ENOKEY: /* negative key */ ret = key_ref; break; default: err = key_ref; break; } } /* search the process keyring second */ if (cred->process_keyring) { key_ref = keyring_search_rcu( make_key_ref(cred->process_keyring, 1), ctx); if (!IS_ERR(key_ref)) goto found; switch (PTR_ERR(key_ref)) { case -EAGAIN: /* no key */ if (ret) break; fallthrough; case -ENOKEY: /* negative key */ ret = key_ref; break; default: err = key_ref; break; } } /* search the session keyring */ if (cred->session_keyring) { key_ref = keyring_search_rcu( make_key_ref(cred->session_keyring, 1), ctx); if (!IS_ERR(key_ref)) goto found; switch (PTR_ERR(key_ref)) { case -EAGAIN: /* no key */ if (ret) break; fallthrough; case -ENOKEY: /* negative key */ ret = key_ref; break; default: err = key_ref; break; } } /* or search the user-session keyring */ else if ((user_session = get_user_session_keyring_rcu(cred))) { key_ref = keyring_search_rcu(make_key_ref(user_session, 1), ctx); key_put(user_session); if (!IS_ERR(key_ref)) goto found; switch (PTR_ERR(key_ref)) { case -EAGAIN: /* no key */ if (ret) break; fallthrough; case -ENOKEY: /* negative key */ ret = key_ref; break; default: err = key_ref; break; } } /* no key - decide on the error we're going to go for */ key_ref = ret ? ret : err; found: return key_ref; } /* * Search the process keyrings attached to the supplied cred for the first * matching key in the manner of search_my_process_keyrings(), but also search * the keys attached to the assumed authorisation key using its credentials if * one is available. * * The caller must be holding the RCU read lock. * * Return same as search_cred_keyrings_rcu(). */ key_ref_t search_process_keyrings_rcu(struct keyring_search_context *ctx) { struct request_key_auth *rka; key_ref_t key_ref, ret = ERR_PTR(-EACCES), err; key_ref = search_cred_keyrings_rcu(ctx); if (!IS_ERR(key_ref)) goto found; err = key_ref; /* if this process has an instantiation authorisation key, then we also * search the keyrings of the process mentioned there * - we don't permit access to request_key auth keys via this method */ if (ctx->cred->request_key_auth && ctx->cred == current_cred() && ctx->index_key.type != &key_type_request_key_auth ) { const struct cred *cred = ctx->cred; if (key_validate(cred->request_key_auth) == 0) { rka = ctx->cred->request_key_auth->payload.data[0]; //// was search_process_keyrings() [ie. recursive] ctx->cred = rka->cred; key_ref = search_cred_keyrings_rcu(ctx); ctx->cred = cred; if (!IS_ERR(key_ref)) goto found; ret = key_ref; } } /* no key - decide on the error we're going to go for */ if (err == ERR_PTR(-ENOKEY) || ret == ERR_PTR(-ENOKEY)) key_ref = ERR_PTR(-ENOKEY); else if (err == ERR_PTR(-EACCES)) key_ref = ret; else key_ref = err; found: return key_ref; } /* * See if the key we're looking at is the target key. */ bool lookup_user_key_possessed(const struct key *key, const struct key_match_data *match_data) { return key == match_data->raw_data; } /* * Look up a key ID given us by userspace with a given permissions mask to get * the key it refers to. * * Flags can be passed to request that special keyrings be created if referred * to directly, to permit partially constructed keys to be found and to skip * validity and permission checks on the found key. * * Returns a pointer to the key with an incremented usage count if successful; * -EINVAL if the key ID is invalid; -ENOKEY if the key ID does not correspond * to a key or the best found key was a negative key; -EKEYREVOKED or * -EKEYEXPIRED if the best found key was revoked or expired; -EACCES if the * found key doesn't grant the requested permit or the LSM denied access to it; * or -ENOMEM if a special keyring couldn't be created. * * In the case of a successful return, the possession attribute is set on the * returned key reference. */ key_ref_t lookup_user_key(key_serial_t id, unsigned long lflags, enum key_need_perm need_perm) { struct keyring_search_context ctx = { .match_data.cmp = lookup_user_key_possessed, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, .flags = (KEYRING_SEARCH_NO_STATE_CHECK | KEYRING_SEARCH_RECURSE), }; struct request_key_auth *rka; struct key *key, *user_session; key_ref_t key_ref, skey_ref; int ret; try_again: ctx.cred = get_current_cred(); key_ref = ERR_PTR(-ENOKEY); switch (id) { case KEY_SPEC_THREAD_KEYRING: if (!ctx.cred->thread_keyring) { if (!(lflags & KEY_LOOKUP_CREATE)) goto error; ret = install_thread_keyring(); if (ret < 0) { key_ref = ERR_PTR(ret); goto error; } goto reget_creds; } key = ctx.cred->thread_keyring; __key_get(key); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_PROCESS_KEYRING: if (!ctx.cred->process_keyring) { if (!(lflags & KEY_LOOKUP_CREATE)) goto error; ret = install_process_keyring(); if (ret < 0) { key_ref = ERR_PTR(ret); goto error; } goto reget_creds; } key = ctx.cred->process_keyring; __key_get(key); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_SESSION_KEYRING: if (!ctx.cred->session_keyring) { /* always install a session keyring upon access if one * doesn't exist yet */ ret = look_up_user_keyrings(NULL, &user_session); if (ret < 0) goto error; if (lflags & KEY_LOOKUP_CREATE) ret = join_session_keyring(NULL); else ret = install_session_keyring(user_session); key_put(user_session); if (ret < 0) goto error; goto reget_creds; } else if (test_bit(KEY_FLAG_UID_KEYRING, &ctx.cred->session_keyring->flags) && lflags & KEY_LOOKUP_CREATE) { ret = join_session_keyring(NULL); if (ret < 0) goto error; goto reget_creds; } key = ctx.cred->session_keyring; __key_get(key); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_USER_KEYRING: ret = look_up_user_keyrings(&key, NULL); if (ret < 0) goto error; key_ref = make_key_ref(key, 1); break; case KEY_SPEC_USER_SESSION_KEYRING: ret = look_up_user_keyrings(NULL, &key); if (ret < 0) goto error; key_ref = make_key_ref(key, 1); break; case KEY_SPEC_GROUP_KEYRING: /* group keyrings are not yet supported */ key_ref = ERR_PTR(-EINVAL); goto error; case KEY_SPEC_REQKEY_AUTH_KEY: key = ctx.cred->request_key_auth; if (!key) goto error; __key_get(key); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_REQUESTOR_KEYRING: if (!ctx.cred->request_key_auth) goto error; down_read(&ctx.cred->request_key_auth->sem); if (test_bit(KEY_FLAG_REVOKED, &ctx.cred->request_key_auth->flags)) { key_ref = ERR_PTR(-EKEYREVOKED); key = NULL; } else { rka = ctx.cred->request_key_auth->payload.data[0]; key = rka->dest_keyring; __key_get(key); } up_read(&ctx.cred->request_key_auth->sem); if (!key) goto error; key_ref = make_key_ref(key, 1); break; default: key_ref = ERR_PTR(-EINVAL); if (id < 1) goto error; key = key_lookup(id); if (IS_ERR(key)) { key_ref = ERR_CAST(key); goto error; } key_ref = make_key_ref(key, 0); /* check to see if we possess the key */ ctx.index_key = key->index_key; ctx.match_data.raw_data = key; kdebug("check possessed"); rcu_read_lock(); skey_ref = search_process_keyrings_rcu(&ctx); rcu_read_unlock(); kdebug("possessed=%p", skey_ref); if (!IS_ERR(skey_ref)) { key_put(key); key_ref = skey_ref; } break; } /* unlink does not use the nominated key in any way, so can skip all * the permission checks as it is only concerned with the keyring */ if (need_perm != KEY_NEED_UNLINK) { if (!(lflags & KEY_LOOKUP_PARTIAL)) { ret = wait_for_key_construction(key, true); switch (ret) { case -ERESTARTSYS: goto invalid_key; default: if (need_perm != KEY_AUTHTOKEN_OVERRIDE && need_perm != KEY_DEFER_PERM_CHECK) goto invalid_key; break; case 0: break; } } else if (need_perm != KEY_DEFER_PERM_CHECK) { ret = key_validate(key); if (ret < 0) goto invalid_key; } ret = -EIO; if (!(lflags & KEY_LOOKUP_PARTIAL) && key_read_state(key) == KEY_IS_UNINSTANTIATED) goto invalid_key; } /* check the permissions */ ret = key_task_permission(key_ref, ctx.cred, need_perm); if (ret < 0) goto invalid_key; key->last_used_at = ktime_get_real_seconds(); error: put_cred(ctx.cred); return key_ref; invalid_key: key_ref_put(key_ref); key_ref = ERR_PTR(ret); goto error; /* if we attempted to install a keyring, then it may have caused new * creds to be installed */ reget_creds: put_cred(ctx.cred); goto try_again; } EXPORT_SYMBOL(lookup_user_key); /* * Join the named keyring as the session keyring if possible else attempt to * create a new one of that name and join that. * * If the name is NULL, an empty anonymous keyring will be installed as the * session keyring. * * Named session keyrings are joined with a semaphore held to prevent the * keyrings from going away whilst the attempt is made to going them and also * to prevent a race in creating compatible session keyrings. */ long join_session_keyring(const char *name) { const struct cred *old; struct cred *new; struct key *keyring; long ret, serial; new = prepare_creds(); if (!new) return -ENOMEM; old = current_cred(); /* if no name is provided, install an anonymous keyring */ if (!name) { ret = install_session_keyring_to_cred(new, NULL); if (ret < 0) goto error; serial = new->session_keyring->serial; ret = commit_creds(new); if (ret == 0) ret = serial; goto okay; } /* allow the user to join or create a named keyring */ mutex_lock(&key_session_mutex); /* look for an existing keyring of this name */ keyring = find_keyring_by_name(name, false); if (PTR_ERR(keyring) == -ENOKEY) { /* not found - try and create a new one */ keyring = keyring_alloc( name, old->uid, old->gid, old, KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_LINK, KEY_ALLOC_IN_QUOTA, NULL, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error2; } } else if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error2; } else if (keyring == new->session_keyring) { ret = 0; goto error3; } /* we've got a keyring - now to install it */ ret = install_session_keyring_to_cred(new, keyring); if (ret < 0) goto error3; commit_creds(new); mutex_unlock(&key_session_mutex); ret = keyring->serial; key_put(keyring); okay: return ret; error3: key_put(keyring); error2: mutex_unlock(&key_session_mutex); error: abort_creds(new); return ret; } /* * Replace a process's session keyring on behalf of one of its children when * the target process is about to resume userspace execution. */ void key_change_session_keyring(struct callback_head *twork) { const struct cred *old = current_cred(); struct cred *new = container_of(twork, struct cred, rcu); if (unlikely(current->flags & PF_EXITING)) { put_cred(new); return; } /* If get_ucounts fails more bits are needed in the refcount */ if (unlikely(!get_ucounts(old->ucounts))) { WARN_ONCE(1, "In %s get_ucounts failed\n", __func__); put_cred(new); return; } new-> uid = old-> uid; new-> euid = old-> euid; new-> suid = old-> suid; new->fsuid = old->fsuid; new-> gid = old-> gid; new-> egid = old-> egid; new-> sgid = old-> sgid; new->fsgid = old->fsgid; new->user = get_uid(old->user); new->ucounts = old->ucounts; new->user_ns = get_user_ns(old->user_ns); new->group_info = get_group_info(old->group_info); new->securebits = old->securebits; new->cap_inheritable = old->cap_inheritable; new->cap_permitted = old->cap_permitted; new->cap_effective = old->cap_effective; new->cap_ambient = old->cap_ambient; new->cap_bset = old->cap_bset; new->jit_keyring = old->jit_keyring; new->thread_keyring = key_get(old->thread_keyring); new->process_keyring = key_get(old->process_keyring); security_transfer_creds(new, old); commit_creds(new); } /* * Make sure that root's user and user-session keyrings exist. */ static int __init init_root_keyring(void) { return look_up_user_keyrings(NULL, NULL); } late_initcall(init_root_keyring);
542 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 // SPDX-License-Identifier: GPL-2.0-only /* * A generic implementation of binary search for the Linux kernel * * Copyright (C) 2008-2009 Ksplice, Inc. * Author: Tim Abbott <tabbott@ksplice.com> */ #include <linux/export.h> #include <linux/bsearch.h> #include <linux/kprobes.h> /* * bsearch - binary search an array of elements * @key: pointer to item being searched for * @base: pointer to first element to search * @num: number of elements * @size: size of each element * @cmp: pointer to comparison function * * This function does a binary search on the given array. The * contents of the array should already be in ascending sorted order * under the provided comparison function. * * Note that the key need not have the same type as the elements in * the array, e.g. key could be a string and the comparison function * could compare the string with the struct's name field. However, if * the key and elements in the array are of the same type, you can use * the same comparison function for both sort() and bsearch(). */ void *bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp) { return __inline_bsearch(key, base, num, size, cmp); } EXPORT_SYMBOL(bsearch); NOKPROBE_SYMBOL(bsearch);
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org> * Copyright (c) 2012 Intel Corporation */ #include <linux/module.h> #include <linux/init.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/string.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_tables.h> #include <net/ip.h> struct nft_nat { u8 sreg_addr_min; u8 sreg_addr_max; u8 sreg_proto_min; u8 sreg_proto_max; enum nf_nat_manip_type type:8; u8 family; u16 flags; }; static void nft_nat_setup_addr(struct nf_nat_range2 *range, const struct nft_regs *regs, const struct nft_nat *priv) { switch (priv->family) { case AF_INET: range->min_addr.ip = (__force __be32) regs->data[priv->sreg_addr_min]; range->max_addr.ip = (__force __be32) regs->data[priv->sreg_addr_max]; break; case AF_INET6: memcpy(range->min_addr.ip6, &regs->data[priv->sreg_addr_min], sizeof(range->min_addr.ip6)); memcpy(range->max_addr.ip6, &regs->data[priv->sreg_addr_max], sizeof(range->max_addr.ip6)); break; } } static void nft_nat_setup_proto(struct nf_nat_range2 *range, const struct nft_regs *regs, const struct nft_nat *priv) { range->min_proto.all = (__force __be16) nft_reg_load16(&regs->data[priv->sreg_proto_min]); range->max_proto.all = (__force __be16) nft_reg_load16(&regs->data[priv->sreg_proto_max]); } static void nft_nat_setup_netmap(struct nf_nat_range2 *range, const struct nft_pktinfo *pkt, const struct nft_nat *priv) { struct sk_buff *skb = pkt->skb; union nf_inet_addr new_addr; __be32 netmask; int i, len = 0; switch (priv->type) { case NFT_NAT_SNAT: if (nft_pf(pkt) == NFPROTO_IPV4) { new_addr.ip = ip_hdr(skb)->saddr; len = sizeof(struct in_addr); } else { new_addr.in6 = ipv6_hdr(skb)->saddr; len = sizeof(struct in6_addr); } break; case NFT_NAT_DNAT: if (nft_pf(pkt) == NFPROTO_IPV4) { new_addr.ip = ip_hdr(skb)->daddr; len = sizeof(struct in_addr); } else { new_addr.in6 = ipv6_hdr(skb)->daddr; len = sizeof(struct in6_addr); } break; } for (i = 0; i < len / sizeof(__be32); i++) { netmask = ~(range->min_addr.ip6[i] ^ range->max_addr.ip6[i]); new_addr.ip6[i] &= ~netmask; new_addr.ip6[i] |= range->min_addr.ip6[i] & netmask; } range->min_addr = new_addr; range->max_addr = new_addr; } static void nft_nat_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_nat *priv = nft_expr_priv(expr); enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(pkt->skb, &ctinfo); struct nf_nat_range2 range; memset(&range, 0, sizeof(range)); if (priv->sreg_addr_min) { nft_nat_setup_addr(&range, regs, priv); if (priv->flags & NF_NAT_RANGE_NETMAP) nft_nat_setup_netmap(&range, pkt, priv); } if (priv->sreg_proto_min) nft_nat_setup_proto(&range, regs, priv); range.flags = priv->flags; regs->verdict.code = nf_nat_setup_info(ct, &range, priv->type); } static const struct nla_policy nft_nat_policy[NFTA_NAT_MAX + 1] = { [NFTA_NAT_TYPE] = { .type = NLA_U32 }, [NFTA_NAT_FAMILY] = { .type = NLA_U32 }, [NFTA_NAT_REG_ADDR_MIN] = { .type = NLA_U32 }, [NFTA_NAT_REG_ADDR_MAX] = { .type = NLA_U32 }, [NFTA_NAT_REG_PROTO_MIN] = { .type = NLA_U32 }, [NFTA_NAT_REG_PROTO_MAX] = { .type = NLA_U32 }, [NFTA_NAT_FLAGS] = NLA_POLICY_MASK(NLA_BE32, NF_NAT_RANGE_MASK), }; static int nft_nat_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data) { struct nft_nat *priv = nft_expr_priv(expr); int err; if (ctx->family != NFPROTO_IPV4 && ctx->family != NFPROTO_IPV6 && ctx->family != NFPROTO_INET) return -EOPNOTSUPP; err = nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT); if (err < 0) return err; switch (priv->type) { case NFT_NAT_SNAT: err = nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN)); break; case NFT_NAT_DNAT: err = nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT)); break; } return err; } static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_nat *priv = nft_expr_priv(expr); unsigned int alen, plen; u32 family; int err; if (tb[NFTA_NAT_TYPE] == NULL || (tb[NFTA_NAT_REG_ADDR_MIN] == NULL && tb[NFTA_NAT_REG_PROTO_MIN] == NULL)) return -EINVAL; switch (ntohl(nla_get_be32(tb[NFTA_NAT_TYPE]))) { case NFT_NAT_SNAT: priv->type = NF_NAT_MANIP_SRC; break; case NFT_NAT_DNAT: priv->type = NF_NAT_MANIP_DST; break; default: return -EOPNOTSUPP; } if (tb[NFTA_NAT_FAMILY] == NULL) return -EINVAL; family = ntohl(nla_get_be32(tb[NFTA_NAT_FAMILY])); if (ctx->family != NFPROTO_INET && ctx->family != family) return -EOPNOTSUPP; switch (family) { case NFPROTO_IPV4: alen = sizeof_field(struct nf_nat_range, min_addr.ip); break; case NFPROTO_IPV6: alen = sizeof_field(struct nf_nat_range, min_addr.ip6); break; default: if (tb[NFTA_NAT_REG_ADDR_MIN]) return -EAFNOSUPPORT; break; } priv->family = family; if (tb[NFTA_NAT_REG_ADDR_MIN]) { err = nft_parse_register_load(tb[NFTA_NAT_REG_ADDR_MIN], &priv->sreg_addr_min, alen); if (err < 0) return err; if (tb[NFTA_NAT_REG_ADDR_MAX]) { err = nft_parse_register_load(tb[NFTA_NAT_REG_ADDR_MAX], &priv->sreg_addr_max, alen); if (err < 0) return err; } else { priv->sreg_addr_max = priv->sreg_addr_min; } priv->flags |= NF_NAT_RANGE_MAP_IPS; } plen = sizeof_field(struct nf_nat_range, min_proto.all); if (tb[NFTA_NAT_REG_PROTO_MIN]) { err = nft_parse_register_load(tb[NFTA_NAT_REG_PROTO_MIN], &priv->sreg_proto_min, plen); if (err < 0) return err; if (tb[NFTA_NAT_REG_PROTO_MAX]) { err = nft_parse_register_load(tb[NFTA_NAT_REG_PROTO_MAX], &priv->sreg_proto_max, plen); if (err < 0) return err; } else { priv->sreg_proto_max = priv->sreg_proto_min; } priv->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } if (tb[NFTA_NAT_FLAGS]) priv->flags |= ntohl(nla_get_be32(tb[NFTA_NAT_FLAGS])); return nf_ct_netns_get(ctx->net, family); } static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_nat *priv = nft_expr_priv(expr); switch (priv->type) { case NF_NAT_MANIP_SRC: if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_SNAT))) goto nla_put_failure; break; case NF_NAT_MANIP_DST: if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_DNAT))) goto nla_put_failure; break; } if (nla_put_be32(skb, NFTA_NAT_FAMILY, htonl(priv->family))) goto nla_put_failure; if (priv->sreg_addr_min) { if (nft_dump_register(skb, NFTA_NAT_REG_ADDR_MIN, priv->sreg_addr_min) || nft_dump_register(skb, NFTA_NAT_REG_ADDR_MAX, priv->sreg_addr_max)) goto nla_put_failure; } if (priv->sreg_proto_min) { if (nft_dump_register(skb, NFTA_NAT_REG_PROTO_MIN, priv->sreg_proto_min) || nft_dump_register(skb, NFTA_NAT_REG_PROTO_MAX, priv->sreg_proto_max)) goto nla_put_failure; } if (priv->flags != 0) { if (nla_put_be32(skb, NFTA_NAT_FLAGS, htonl(priv->flags))) goto nla_put_failure; } return 0; nla_put_failure: return -1; } static void nft_nat_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_nat *priv = nft_expr_priv(expr); nf_ct_netns_put(ctx->net, priv->family); } static struct nft_expr_type nft_nat_type; static const struct nft_expr_ops nft_nat_ops = { .type = &nft_nat_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_nat)), .eval = nft_nat_eval, .init = nft_nat_init, .destroy = nft_nat_destroy, .dump = nft_nat_dump, .validate = nft_nat_validate, .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_nat_type __read_mostly = { .name = "nat", .ops = &nft_nat_ops, .policy = nft_nat_policy, .maxattr = NFTA_NAT_MAX, .owner = THIS_MODULE, }; #ifdef CONFIG_NF_TABLES_INET static void nft_nat_inet_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_nat *priv = nft_expr_priv(expr); if (priv->family == nft_pf(pkt) || priv->family == NFPROTO_INET) nft_nat_eval(expr, regs, pkt); } static const struct nft_expr_ops nft_nat_inet_ops = { .type = &nft_nat_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_nat)), .eval = nft_nat_inet_eval, .init = nft_nat_init, .destroy = nft_nat_destroy, .dump = nft_nat_dump, .validate = nft_nat_validate, .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_inet_nat_type __read_mostly = { .name = "nat", .family = NFPROTO_INET, .ops = &nft_nat_inet_ops, .policy = nft_nat_policy, .maxattr = NFTA_NAT_MAX, .owner = THIS_MODULE, }; static int nft_nat_inet_module_init(void) { return nft_register_expr(&nft_inet_nat_type); } static void nft_nat_inet_module_exit(void) { nft_unregister_expr(&nft_inet_nat_type); } #else static int nft_nat_inet_module_init(void) { return 0; } static void nft_nat_inet_module_exit(void) { } #endif static int __init nft_nat_module_init(void) { int ret = nft_nat_inet_module_init(); if (ret) return ret; ret = nft_register_expr(&nft_nat_type); if (ret) nft_nat_inet_module_exit(); return ret; } static void __exit nft_nat_module_exit(void) { nft_nat_inet_module_exit(); nft_unregister_expr(&nft_nat_type); } module_init(nft_nat_module_init); module_exit(nft_nat_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>"); MODULE_ALIAS_NFT_EXPR("nat"); MODULE_DESCRIPTION("Network Address Translation support");
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_BLOCKGROUP_LOCK_H #define _LINUX_BLOCKGROUP_LOCK_H /* * Per-blockgroup locking for ext2 and ext3. * * Simple hashed spinlocking. */ #include <linux/spinlock.h> #include <linux/cache.h> #ifdef CONFIG_SMP #define NR_BG_LOCKS (4 << ilog2(NR_CPUS < 32 ? NR_CPUS : 32)) #else #define NR_BG_LOCKS 1 #endif struct bgl_lock { spinlock_t lock; } ____cacheline_aligned_in_smp; struct blockgroup_lock { struct bgl_lock locks[NR_BG_LOCKS]; }; static inline void bgl_lock_init(struct blockgroup_lock *bgl) { int i; for (i = 0; i < NR_BG_LOCKS; i++) spin_lock_init(&bgl->locks[i].lock); } static inline spinlock_t * bgl_lock_ptr(struct blockgroup_lock *bgl, unsigned int block_group) { return &bgl->locks[block_group & (NR_BG_LOCKS-1)].lock; } #endif
1 1 1 1 1 1 7 15 5 7 8 3 5 7 1 2 5 5 5 5 5 5 5 4 19 11 19 11 11 11 11 11 14 14 14 14 14 14 15 14 7 3 3 3 4 4 4 4 4 3 3 7 7 2 2 2 2 2 25 5 4 4 3 3 3 3 2 3 2 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 13 13 12 12 4 4 4 4 4 4 4 4 9 1 1 1 1 3 3 2 4 1 3 4 3 3 3 3 3 3 2 3 3 3 3 2 8 6 10 10 10 3 3 3 2 7 7 7 7 1 6 7 5 1 1 1 1 1 16 3 1 3 13 13 5 6 5 2 2 3 3 3 3 3 3 3 3 3 3 3 2 2 2 2 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 2 2 3 2 2 1 2 2 2 1 1 1 1 1 2 2 2 2 3 7 7 1 1 1 1 1 1 13 13 2 9 7 2 9 7 9 7 6 6 6 5 5 5 5 5 3 3 5 4 4 4 6 1 1 1 5 4 3 3 3 3 3 3 7 6 1 5 13 10 12 6 3 3 3 3 1 1 1 1 1 1 7 1 1 2 1 5 3 12 13 1 3 3 3 3 3 21 2 20 19 6 2 2 2 2 2 2 1 1 1 20 1 1 1 1 1 1 1 16 16 1 7 7 7 7 6 6 6 6 6 1 5 4 5 4 4 2 2 2 2 2 1 5 3 1 1 1 1 1 1 1 1 1 1 1 1 1 10 6 4 2 2 1 1 5 5 10 5 5 5 5 5 1 4 1 4 3 1 1 2 2 2 3 3 3 2 9 9 9 4 8 4 5 4 9 13 13 13 13 13 2 2 2 2 2 2 1 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2009 Red Hat, Inc. * Copyright (C) 2006 Rusty Russell IBM Corporation * * Author: Michael S. Tsirkin <mst@redhat.com> * * Inspiration, some code, and most witty comments come from * Documentation/virtual/lguest/lguest.c, by Rusty Russell * * Generic code for virtio server in host kernel. */ #include <linux/eventfd.h> #include <linux/vhost.h> #include <linux/uio.h> #include <linux/mm.h> #include <linux/miscdevice.h> #include <linux/mutex.h> #include <linux/poll.h> #include <linux/file.h> #include <linux/highmem.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/kthread.h> #include <linux/module.h> #include <linux/sort.h> #include <linux/sched/mm.h> #include <linux/sched/signal.h> #include <linux/sched/vhost_task.h> #include <linux/interval_tree_generic.h> #include <linux/nospec.h> #include <linux/kcov.h> #include "vhost.h" static ushort max_mem_regions = 64; module_param(max_mem_regions, ushort, 0444); MODULE_PARM_DESC(max_mem_regions, "Maximum number of memory regions in memory map. (default: 64)"); static int max_iotlb_entries = 2048; module_param(max_iotlb_entries, int, 0444); MODULE_PARM_DESC(max_iotlb_entries, "Maximum number of iotlb entries. (default: 2048)"); enum { VHOST_MEMORY_F_LOG = 0x1, }; #define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num]) #define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num]) #ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY static void vhost_disable_cross_endian(struct vhost_virtqueue *vq) { vq->user_be = !virtio_legacy_is_little_endian(); } static void vhost_enable_cross_endian_big(struct vhost_virtqueue *vq) { vq->user_be = true; } static void vhost_enable_cross_endian_little(struct vhost_virtqueue *vq) { vq->user_be = false; } static long vhost_set_vring_endian(struct vhost_virtqueue *vq, int __user *argp) { struct vhost_vring_state s; if (vq->private_data) return -EBUSY; if (copy_from_user(&s, argp, sizeof(s))) return -EFAULT; if (s.num != VHOST_VRING_LITTLE_ENDIAN && s.num != VHOST_VRING_BIG_ENDIAN) return -EINVAL; if (s.num == VHOST_VRING_BIG_ENDIAN) vhost_enable_cross_endian_big(vq); else vhost_enable_cross_endian_little(vq); return 0; } static long vhost_get_vring_endian(struct vhost_virtqueue *vq, u32 idx, int __user *argp) { struct vhost_vring_state s = { .index = idx, .num = vq->user_be }; if (copy_to_user(argp, &s, sizeof(s))) return -EFAULT; return 0; } static void vhost_init_is_le(struct vhost_virtqueue *vq) { /* Note for legacy virtio: user_be is initialized at reset time * according to the host endianness. If userspace does not set an * explicit endianness, the default behavior is native endian, as * expected by legacy virtio. */ vq->is_le = vhost_has_feature(vq, VIRTIO_F_VERSION_1) || !vq->user_be; } #else static void vhost_disable_cross_endian(struct vhost_virtqueue *vq) { } static long vhost_set_vring_endian(struct vhost_virtqueue *vq, int __user *argp) { return -ENOIOCTLCMD; } static long vhost_get_vring_endian(struct vhost_virtqueue *vq, u32 idx, int __user *argp) { return -ENOIOCTLCMD; } static void vhost_init_is_le(struct vhost_virtqueue *vq) { vq->is_le = vhost_has_feature(vq, VIRTIO_F_VERSION_1) || virtio_legacy_is_little_endian(); } #endif /* CONFIG_VHOST_CROSS_ENDIAN_LEGACY */ static void vhost_reset_is_le(struct vhost_virtqueue *vq) { vhost_init_is_le(vq); } struct vhost_flush_struct { struct vhost_work work; struct completion wait_event; }; static void vhost_flush_work(struct vhost_work *work) { struct vhost_flush_struct *s; s = container_of(work, struct vhost_flush_struct, work); complete(&s->wait_event); } static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh, poll_table *pt) { struct vhost_poll *poll; poll = container_of(pt, struct vhost_poll, table); poll->wqh = wqh; add_wait_queue(wqh, &poll->wait); } static int vhost_poll_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait); struct vhost_work *work = &poll->work; if (!(key_to_poll(key) & poll->mask)) return 0; if (!poll->dev->use_worker) work->fn(work); else vhost_poll_queue(poll); return 0; } void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn) { clear_bit(VHOST_WORK_QUEUED, &work->flags); work->fn = fn; } EXPORT_SYMBOL_GPL(vhost_work_init); /* Init poll structure */ void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn, __poll_t mask, struct vhost_dev *dev, struct vhost_virtqueue *vq) { init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup); init_poll_funcptr(&poll->table, vhost_poll_func); poll->mask = mask; poll->dev = dev; poll->wqh = NULL; poll->vq = vq; vhost_work_init(&poll->work, fn); } EXPORT_SYMBOL_GPL(vhost_poll_init); /* Start polling a file. We add ourselves to file's wait queue. The caller must * keep a reference to a file until after vhost_poll_stop is called. */ int vhost_poll_start(struct vhost_poll *poll, struct file *file) { __poll_t mask; if (poll->wqh) return 0; mask = vfs_poll(file, &poll->table); if (mask) vhost_poll_wakeup(&poll->wait, 0, 0, poll_to_key(mask)); if (mask & EPOLLERR) { vhost_poll_stop(poll); return -EINVAL; } return 0; } EXPORT_SYMBOL_GPL(vhost_poll_start); /* Stop polling a file. After this function returns, it becomes safe to drop the * file reference. You must also flush afterwards. */ void vhost_poll_stop(struct vhost_poll *poll) { if (poll->wqh) { remove_wait_queue(poll->wqh, &poll->wait); poll->wqh = NULL; } } EXPORT_SYMBOL_GPL(vhost_poll_stop); static void vhost_worker_queue(struct vhost_worker *worker, struct vhost_work *work) { if (!test_and_set_bit(VHOST_WORK_QUEUED, &work->flags)) { /* We can only add the work to the list after we're * sure it was not in the list. * test_and_set_bit() implies a memory barrier. */ llist_add(&work->node, &worker->work_list); vhost_task_wake(worker->vtsk); } } bool vhost_vq_work_queue(struct vhost_virtqueue *vq, struct vhost_work *work) { struct vhost_worker *worker; bool queued = false; rcu_read_lock(); worker = rcu_dereference(vq->worker); if (worker) { queued = true; vhost_worker_queue(worker, work); } rcu_read_unlock(); return queued; } EXPORT_SYMBOL_GPL(vhost_vq_work_queue); /** * __vhost_worker_flush - flush a worker * @worker: worker to flush * * The worker's flush_mutex must be held. */ static void __vhost_worker_flush(struct vhost_worker *worker) { struct vhost_flush_struct flush; if (!worker->attachment_cnt || worker->killed) return; init_completion(&flush.wait_event); vhost_work_init(&flush.work, vhost_flush_work); vhost_worker_queue(worker, &flush.work); /* * Drop mutex in case our worker is killed and it needs to take the * mutex to force cleanup. */ mutex_unlock(&worker->mutex); wait_for_completion(&flush.wait_event); mutex_lock(&worker->mutex); } static void vhost_worker_flush(struct vhost_worker *worker) { mutex_lock(&worker->mutex); __vhost_worker_flush(worker); mutex_unlock(&worker->mutex); } void vhost_dev_flush(struct vhost_dev *dev) { struct vhost_worker *worker; unsigned long i; xa_for_each(&dev->worker_xa, i, worker) vhost_worker_flush(worker); } EXPORT_SYMBOL_GPL(vhost_dev_flush); /* A lockless hint for busy polling code to exit the loop */ bool vhost_vq_has_work(struct vhost_virtqueue *vq) { struct vhost_worker *worker; bool has_work = false; rcu_read_lock(); worker = rcu_dereference(vq->worker); if (worker && !llist_empty(&worker->work_list)) has_work = true; rcu_read_unlock(); return has_work; } EXPORT_SYMBOL_GPL(vhost_vq_has_work); void vhost_poll_queue(struct vhost_poll *poll) { vhost_vq_work_queue(poll->vq, &poll->work); } EXPORT_SYMBOL_GPL(vhost_poll_queue); static void __vhost_vq_meta_reset(struct vhost_virtqueue *vq) { int j; for (j = 0; j < VHOST_NUM_ADDRS; j++) vq->meta_iotlb[j] = NULL; } static void vhost_vq_meta_reset(struct vhost_dev *d) { int i; for (i = 0; i < d->nvqs; ++i) __vhost_vq_meta_reset(d->vqs[i]); } static void vhost_vring_call_reset(struct vhost_vring_call *call_ctx) { call_ctx->ctx = NULL; memset(&call_ctx->producer, 0x0, sizeof(struct irq_bypass_producer)); } bool vhost_vq_is_setup(struct vhost_virtqueue *vq) { return vq->avail && vq->desc && vq->used && vhost_vq_access_ok(vq); } EXPORT_SYMBOL_GPL(vhost_vq_is_setup); static void vhost_vq_reset(struct vhost_dev *dev, struct vhost_virtqueue *vq) { vq->num = 1; vq->desc = NULL; vq->avail = NULL; vq->used = NULL; vq->last_avail_idx = 0; vq->avail_idx = 0; vq->last_used_idx = 0; vq->signalled_used = 0; vq->signalled_used_valid = false; vq->used_flags = 0; vq->log_used = false; vq->log_addr = -1ull; vq->private_data = NULL; vq->acked_features = 0; vq->acked_backend_features = 0; vq->log_base = NULL; vq->error_ctx = NULL; vq->kick = NULL; vq->log_ctx = NULL; vhost_disable_cross_endian(vq); vhost_reset_is_le(vq); vq->busyloop_timeout = 0; vq->umem = NULL; vq->iotlb = NULL; rcu_assign_pointer(vq->worker, NULL); vhost_vring_call_reset(&vq->call_ctx); __vhost_vq_meta_reset(vq); } static bool vhost_run_work_list(void *data) { struct vhost_worker *worker = data; struct vhost_work *work, *work_next; struct llist_node *node; node = llist_del_all(&worker->work_list); if (node) { __set_current_state(TASK_RUNNING); node = llist_reverse_order(node); /* make sure flag is seen after deletion */ smp_wmb(); llist_for_each_entry_safe(work, work_next, node, node) { clear_bit(VHOST_WORK_QUEUED, &work->flags); kcov_remote_start_common(worker->kcov_handle); work->fn(work); kcov_remote_stop(); cond_resched(); } } return !!node; } static void vhost_worker_killed(void *data) { struct vhost_worker *worker = data; struct vhost_dev *dev = worker->dev; struct vhost_virtqueue *vq; int i, attach_cnt = 0; mutex_lock(&worker->mutex); worker->killed = true; for (i = 0; i < dev->nvqs; i++) { vq = dev->vqs[i]; mutex_lock(&vq->mutex); if (worker == rcu_dereference_check(vq->worker, lockdep_is_held(&vq->mutex))) { rcu_assign_pointer(vq->worker, NULL); attach_cnt++; } mutex_unlock(&vq->mutex); } worker->attachment_cnt -= attach_cnt; if (attach_cnt) synchronize_rcu(); /* * Finish vhost_worker_flush calls and any other works that snuck in * before the synchronize_rcu. */ vhost_run_work_list(worker); mutex_unlock(&worker->mutex); } static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq) { kfree(vq->indirect); vq->indirect = NULL; kfree(vq->log); vq->log = NULL; kfree(vq->heads); vq->heads = NULL; } /* Helper to allocate iovec buffers for all vqs. */ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev) { struct vhost_virtqueue *vq; int i; for (i = 0; i < dev->nvqs; ++i) { vq = dev->vqs[i]; vq->indirect = kmalloc_array(UIO_MAXIOV, sizeof(*vq->indirect), GFP_KERNEL); vq->log = kmalloc_array(dev->iov_limit, sizeof(*vq->log), GFP_KERNEL); vq->heads = kmalloc_array(dev->iov_limit, sizeof(*vq->heads), GFP_KERNEL); if (!vq->indirect || !vq->log || !vq->heads) goto err_nomem; } return 0; err_nomem: for (; i >= 0; --i) vhost_vq_free_iovecs(dev->vqs[i]); return -ENOMEM; } static void vhost_dev_free_iovecs(struct vhost_dev *dev) { int i; for (i = 0; i < dev->nvqs; ++i) vhost_vq_free_iovecs(dev->vqs[i]); } bool vhost_exceeds_weight(struct vhost_virtqueue *vq, int pkts, int total_len) { struct vhost_dev *dev = vq->dev; if ((dev->byte_weight && total_len >= dev->byte_weight) || pkts >= dev->weight) { vhost_poll_queue(&vq->poll); return true; } return false; } EXPORT_SYMBOL_GPL(vhost_exceeds_weight); static size_t vhost_get_avail_size(struct vhost_virtqueue *vq, unsigned int num) { size_t event __maybe_unused = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0; return size_add(struct_size(vq->avail, ring, num), event); } static size_t vhost_get_used_size(struct vhost_virtqueue *vq, unsigned int num) { size_t event __maybe_unused = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0; return size_add(struct_size(vq->used, ring, num), event); } static size_t vhost_get_desc_size(struct vhost_virtqueue *vq, unsigned int num) { return sizeof(*vq->desc) * num; } void vhost_dev_init(struct vhost_dev *dev, struct vhost_virtqueue **vqs, int nvqs, int iov_limit, int weight, int byte_weight, bool use_worker, int (*msg_handler)(struct vhost_dev *dev, u32 asid, struct vhost_iotlb_msg *msg)) { struct vhost_virtqueue *vq; int i; dev->vqs = vqs; dev->nvqs = nvqs; mutex_init(&dev->mutex); dev->log_ctx = NULL; dev->umem = NULL; dev->iotlb = NULL; dev->mm = NULL; dev->iov_limit = iov_limit; dev->weight = weight; dev->byte_weight = byte_weight; dev->use_worker = use_worker; dev->msg_handler = msg_handler; init_waitqueue_head(&dev->wait); INIT_LIST_HEAD(&dev->read_list); INIT_LIST_HEAD(&dev->pending_list); spin_lock_init(&dev->iotlb_lock); xa_init_flags(&dev->worker_xa, XA_FLAGS_ALLOC); for (i = 0; i < dev->nvqs; ++i) { vq = dev->vqs[i]; vq->log = NULL; vq->indirect = NULL; vq->heads = NULL; vq->dev = dev; mutex_init(&vq->mutex); vhost_vq_reset(dev, vq); if (vq->handle_kick) vhost_poll_init(&vq->poll, vq->handle_kick, EPOLLIN, dev, vq); } } EXPORT_SYMBOL_GPL(vhost_dev_init); /* Caller should have device mutex */ long vhost_dev_check_owner(struct vhost_dev *dev) { /* Are you the owner? If not, I don't think you mean to do that */ return dev->mm == current->mm ? 0 : -EPERM; } EXPORT_SYMBOL_GPL(vhost_dev_check_owner); /* Caller should have device mutex */ bool vhost_dev_has_owner(struct vhost_dev *dev) { return dev->mm; } EXPORT_SYMBOL_GPL(vhost_dev_has_owner); static void vhost_attach_mm(struct vhost_dev *dev) { /* No owner, become one */ if (dev->use_worker) { dev->mm = get_task_mm(current); } else { /* vDPA device does not use worker thead, so there's * no need to hold the address space for mm. This help * to avoid deadlock in the case of mmap() which may * held the refcnt of the file and depends on release * method to remove vma. */ dev->mm = current->mm; mmgrab(dev->mm); } } static void vhost_detach_mm(struct vhost_dev *dev) { if (!dev->mm) return; if (dev->use_worker) mmput(dev->mm); else mmdrop(dev->mm); dev->mm = NULL; } static void vhost_worker_destroy(struct vhost_dev *dev, struct vhost_worker *worker) { if (!worker) return; WARN_ON(!llist_empty(&worker->work_list)); xa_erase(&dev->worker_xa, worker->id); vhost_task_stop(worker->vtsk); kfree(worker); } static void vhost_workers_free(struct vhost_dev *dev) { struct vhost_worker *worker; unsigned long i; if (!dev->use_worker) return; for (i = 0; i < dev->nvqs; i++) rcu_assign_pointer(dev->vqs[i]->worker, NULL); /* * Free the default worker we created and cleanup workers userspace * created but couldn't clean up (it forgot or crashed). */ xa_for_each(&dev->worker_xa, i, worker) vhost_worker_destroy(dev, worker); xa_destroy(&dev->worker_xa); } static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev) { struct vhost_worker *worker; struct vhost_task *vtsk; char name[TASK_COMM_LEN]; int ret; u32 id; worker = kzalloc(sizeof(*worker), GFP_KERNEL_ACCOUNT); if (!worker) return NULL; worker->dev = dev; snprintf(name, sizeof(name), "vhost-%d", current->pid); vtsk = vhost_task_create(vhost_run_work_list, vhost_worker_killed, worker, name); if (!vtsk) goto free_worker; mutex_init(&worker->mutex); init_llist_head(&worker->work_list); worker->kcov_handle = kcov_common_handle(); worker->vtsk = vtsk; vhost_task_start(vtsk); ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, GFP_KERNEL); if (ret < 0) goto stop_worker; worker->id = id; return worker; stop_worker: vhost_task_stop(vtsk); free_worker: kfree(worker); return NULL; } /* Caller must have device mutex */ static void __vhost_vq_attach_worker(struct vhost_virtqueue *vq, struct vhost_worker *worker) { struct vhost_worker *old_worker; mutex_lock(&worker->mutex); if (worker->killed) { mutex_unlock(&worker->mutex); return; } mutex_lock(&vq->mutex); old_worker = rcu_dereference_check(vq->worker, lockdep_is_held(&vq->mutex)); rcu_assign_pointer(vq->worker, worker); worker->attachment_cnt++; if (!old_worker) { mutex_unlock(&vq->mutex); mutex_unlock(&worker->mutex); return; } mutex_unlock(&vq->mutex); mutex_unlock(&worker->mutex); /* * Take the worker mutex to make sure we see the work queued from * device wide flushes which doesn't use RCU for execution. */ mutex_lock(&old_worker->mutex); if (old_worker->killed) { mutex_unlock(&old_worker->mutex); return; } /* * We don't want to call synchronize_rcu for every vq during setup * because it will slow down VM startup. If we haven't done * VHOST_SET_VRING_KICK and not done the driver specific * SET_ENDPOINT/RUNNUNG then we can skip the sync since there will * not be any works queued for scsi and net. */ mutex_lock(&vq->mutex); if (!vhost_vq_get_backend(vq) && !vq->kick) { mutex_unlock(&vq->mutex); old_worker->attachment_cnt--; mutex_unlock(&old_worker->mutex); /* * vsock can queue anytime after VHOST_VSOCK_SET_GUEST_CID. * Warn if it adds support for multiple workers but forgets to * handle the early queueing case. */ WARN_ON(!old_worker->attachment_cnt && !llist_empty(&old_worker->work_list)); return; } mutex_unlock(&vq->mutex); /* Make sure new vq queue/flush/poll calls see the new worker */ synchronize_rcu(); /* Make sure whatever was queued gets run */ __vhost_worker_flush(old_worker); old_worker->attachment_cnt--; mutex_unlock(&old_worker->mutex); } /* Caller must have device mutex */ static int vhost_vq_attach_worker(struct vhost_virtqueue *vq, struct vhost_vring_worker *info) { unsigned long index = info->worker_id; struct vhost_dev *dev = vq->dev; struct vhost_worker *worker; if (!dev->use_worker) return -EINVAL; worker = xa_find(&dev->worker_xa, &index, UINT_MAX, XA_PRESENT); if (!worker || worker->id != info->worker_id) return -ENODEV; __vhost_vq_attach_worker(vq, worker); return 0; } /* Caller must have device mutex */ static int vhost_new_worker(struct vhost_dev *dev, struct vhost_worker_state *info) { struct vhost_worker *worker; worker = vhost_worker_create(dev); if (!worker) return -ENOMEM; info->worker_id = worker->id; return 0; } /* Caller must have device mutex */ static int vhost_free_worker(struct vhost_dev *dev, struct vhost_worker_state *info) { unsigned long index = info->worker_id; struct vhost_worker *worker; worker = xa_find(&dev->worker_xa, &index, UINT_MAX, XA_PRESENT); if (!worker || worker->id != info->worker_id) return -ENODEV; mutex_lock(&worker->mutex); if (worker->attachment_cnt || worker->killed) { mutex_unlock(&worker->mutex); return -EBUSY; } /* * A flush might have raced and snuck in before attachment_cnt was set * to zero. Make sure flushes are flushed from the queue before * freeing. */ __vhost_worker_flush(worker); mutex_unlock(&worker->mutex); vhost_worker_destroy(dev, worker); return 0; } static int vhost_get_vq_from_user(struct vhost_dev *dev, void __user *argp, struct vhost_virtqueue **vq, u32 *id) { u32 __user *idxp = argp; u32 idx; long r; r = get_user(idx, idxp); if (r < 0) return r; if (idx >= dev->nvqs) return -ENOBUFS; idx = array_index_nospec(idx, dev->nvqs); *vq = dev->vqs[idx]; *id = idx; return 0; } /* Caller must have device mutex */ long vhost_worker_ioctl(struct vhost_dev *dev, unsigned int ioctl, void __user *argp) { struct vhost_vring_worker ring_worker; struct vhost_worker_state state; struct vhost_worker *worker; struct vhost_virtqueue *vq; long ret; u32 idx; if (!dev->use_worker) return -EINVAL; if (!vhost_dev_has_owner(dev)) return -EINVAL; ret = vhost_dev_check_owner(dev); if (ret) return ret; switch (ioctl) { /* dev worker ioctls */ case VHOST_NEW_WORKER: ret = vhost_new_worker(dev, &state); if (!ret && copy_to_user(argp, &state, sizeof(state))) ret = -EFAULT; return ret; case VHOST_FREE_WORKER: if (copy_from_user(&state, argp, sizeof(state))) return -EFAULT; return vhost_free_worker(dev, &state); /* vring worker ioctls */ case VHOST_ATTACH_VRING_WORKER: case VHOST_GET_VRING_WORKER: break; default: return -ENOIOCTLCMD; } ret = vhost_get_vq_from_user(dev, argp, &vq, &idx); if (ret) return ret; switch (ioctl) { case VHOST_ATTACH_VRING_WORKER: if (copy_from_user(&ring_worker, argp, sizeof(ring_worker))) { ret = -EFAULT; break; } ret = vhost_vq_attach_worker(vq, &ring_worker); break; case VHOST_GET_VRING_WORKER: worker = rcu_dereference_check(vq->worker, lockdep_is_held(&dev->mutex)); if (!worker) { ret = -EINVAL; break; } ring_worker.index = idx; ring_worker.worker_id = worker->id; if (copy_to_user(argp, &ring_worker, sizeof(ring_worker))) ret = -EFAULT; break; default: ret = -ENOIOCTLCMD; break; } return ret; } EXPORT_SYMBOL_GPL(vhost_worker_ioctl); /* Caller should have device mutex */ long vhost_dev_set_owner(struct vhost_dev *dev) { struct vhost_worker *worker; int err, i; /* Is there an owner already? */ if (vhost_dev_has_owner(dev)) { err = -EBUSY; goto err_mm; } vhost_attach_mm(dev); err = vhost_dev_alloc_iovecs(dev); if (err) goto err_iovecs; if (dev->use_worker) { /* * This should be done last, because vsock can queue work * before VHOST_SET_OWNER so it simplifies the failure path * below since we don't have to worry about vsock queueing * while we free the worker. */ worker = vhost_worker_create(dev); if (!worker) { err = -ENOMEM; goto err_worker; } for (i = 0; i < dev->nvqs; i++) __vhost_vq_attach_worker(dev->vqs[i], worker); } return 0; err_worker: vhost_dev_free_iovecs(dev); err_iovecs: vhost_detach_mm(dev); err_mm: return err; } EXPORT_SYMBOL_GPL(vhost_dev_set_owner); static struct vhost_iotlb *iotlb_alloc(void) { return vhost_iotlb_alloc(max_iotlb_entries, VHOST_IOTLB_FLAG_RETIRE); } struct vhost_iotlb *vhost_dev_reset_owner_prepare(void) { return iotlb_alloc(); } EXPORT_SYMBOL_GPL(vhost_dev_reset_owner_prepare); /* Caller should have device mutex */ void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_iotlb *umem) { int i; vhost_dev_cleanup(dev); dev->umem = umem; /* We don't need VQ locks below since vhost_dev_cleanup makes sure * VQs aren't running. */ for (i = 0; i < dev->nvqs; ++i) dev->vqs[i]->umem = umem; } EXPORT_SYMBOL_GPL(vhost_dev_reset_owner); void vhost_dev_stop(struct vhost_dev *dev) { int i; for (i = 0; i < dev->nvqs; ++i) { if (dev->vqs[i]->kick && dev->vqs[i]->handle_kick) vhost_poll_stop(&dev->vqs[i]->poll); } vhost_dev_flush(dev); } EXPORT_SYMBOL_GPL(vhost_dev_stop); void vhost_clear_msg(struct vhost_dev *dev) { struct vhost_msg_node *node, *n; spin_lock(&dev->iotlb_lock); list_for_each_entry_safe(node, n, &dev->read_list, node) { list_del(&node->node); kfree(node); } list_for_each_entry_safe(node, n, &dev->pending_list, node) { list_del(&node->node); kfree(node); } spin_unlock(&dev->iotlb_lock); } EXPORT_SYMBOL_GPL(vhost_clear_msg); void vhost_dev_cleanup(struct vhost_dev *dev) { int i; for (i = 0; i < dev->nvqs; ++i) { if (dev->vqs[i]->error_ctx) eventfd_ctx_put(dev->vqs[i]->error_ctx); if (dev->vqs[i]->kick) fput(dev->vqs[i]->kick); if (dev->vqs[i]->call_ctx.ctx) eventfd_ctx_put(dev->vqs[i]->call_ctx.ctx); vhost_vq_reset(dev, dev->vqs[i]); } vhost_dev_free_iovecs(dev); if (dev->log_ctx) eventfd_ctx_put(dev->log_ctx); dev->log_ctx = NULL; /* No one will access memory at this point */ vhost_iotlb_free(dev->umem); dev->umem = NULL; vhost_iotlb_free(dev->iotlb); dev->iotlb = NULL; vhost_clear_msg(dev); wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM); vhost_workers_free(dev); vhost_detach_mm(dev); } EXPORT_SYMBOL_GPL(vhost_dev_cleanup); static bool log_access_ok(void __user *log_base, u64 addr, unsigned long sz) { u64 a = addr / VHOST_PAGE_SIZE / 8; /* Make sure 64 bit math will not overflow. */ if (a > ULONG_MAX - (unsigned long)log_base || a + (unsigned long)log_base > ULONG_MAX) return false; return access_ok(log_base + a, (sz + VHOST_PAGE_SIZE * 8 - 1) / VHOST_PAGE_SIZE / 8); } /* Make sure 64 bit math will not overflow. */ static bool vhost_overflow(u64 uaddr, u64 size) { if (uaddr > ULONG_MAX || size > ULONG_MAX) return true; if (!size) return false; return uaddr > ULONG_MAX - size + 1; } /* Caller should have vq mutex and device mutex. */ static bool vq_memory_access_ok(void __user *log_base, struct vhost_iotlb *umem, int log_all) { struct vhost_iotlb_map *map; if (!umem) return false; list_for_each_entry(map, &umem->list, link) { unsigned long a = map->addr; if (vhost_overflow(map->addr, map->size)) return false; if (!access_ok((void __user *)a, map->size)) return false; else if (log_all && !log_access_ok(log_base, map->start, map->size)) return false; } return true; } static inline void __user *vhost_vq_meta_fetch(struct vhost_virtqueue *vq, u64 addr, unsigned int size, int type) { const struct vhost_iotlb_map *map = vq->meta_iotlb[type]; if (!map) return NULL; return (void __user *)(uintptr_t)(map->addr + addr - map->start); } /* Can we switch to this memory table? */ /* Caller should have device mutex but not vq mutex */ static bool memory_access_ok(struct vhost_dev *d, struct vhost_iotlb *umem, int log_all) { int i; for (i = 0; i < d->nvqs; ++i) { bool ok; bool log; mutex_lock(&d->vqs[i]->mutex); log = log_all || vhost_has_feature(d->vqs[i], VHOST_F_LOG_ALL); /* If ring is inactive, will check when it's enabled. */ if (d->vqs[i]->private_data) ok = vq_memory_access_ok(d->vqs[i]->log_base, umem, log); else ok = true; mutex_unlock(&d->vqs[i]->mutex); if (!ok) return false; } return true; } static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len, struct iovec iov[], int iov_size, int access); static int vhost_copy_to_user(struct vhost_virtqueue *vq, void __user *to, const void *from, unsigned size) { int ret; if (!vq->iotlb) return __copy_to_user(to, from, size); else { /* This function should be called after iotlb * prefetch, which means we're sure that all vq * could be access through iotlb. So -EAGAIN should * not happen in this case. */ struct iov_iter t; void __user *uaddr = vhost_vq_meta_fetch(vq, (u64)(uintptr_t)to, size, VHOST_ADDR_USED); if (uaddr) return __copy_to_user(uaddr, from, size); ret = translate_desc(vq, (u64)(uintptr_t)to, size, vq->iotlb_iov, ARRAY_SIZE(vq->iotlb_iov), VHOST_ACCESS_WO); if (ret < 0) goto out; iov_iter_init(&t, ITER_DEST, vq->iotlb_iov, ret, size); ret = copy_to_iter(from, size, &t); if (ret == size) ret = 0; } out: return ret; } static int vhost_copy_from_user(struct vhost_virtqueue *vq, void *to, void __user *from, unsigned size) { int ret; if (!vq->iotlb) return __copy_from_user(to, from, size); else { /* This function should be called after iotlb * prefetch, which means we're sure that vq * could be access through iotlb. So -EAGAIN should * not happen in this case. */ void __user *uaddr = vhost_vq_meta_fetch(vq, (u64)(uintptr_t)from, size, VHOST_ADDR_DESC); struct iov_iter f; if (uaddr) return __copy_from_user(to, uaddr, size); ret = translate_desc(vq, (u64)(uintptr_t)from, size, vq->iotlb_iov, ARRAY_SIZE(vq->iotlb_iov), VHOST_ACCESS_RO); if (ret < 0) { vq_err(vq, "IOTLB translation failure: uaddr " "%p size 0x%llx\n", from, (unsigned long long) size); goto out; } iov_iter_init(&f, ITER_SOURCE, vq->iotlb_iov, ret, size); ret = copy_from_iter(to, size, &f); if (ret == size) ret = 0; } out: return ret; } static void __user *__vhost_get_user_slow(struct vhost_virtqueue *vq, void __user *addr, unsigned int size, int type) { int ret; ret = translate_desc(vq, (u64)(uintptr_t)addr, size, vq->iotlb_iov, ARRAY_SIZE(vq->iotlb_iov), VHOST_ACCESS_RO); if (ret < 0) { vq_err(vq, "IOTLB translation failure: uaddr " "%p size 0x%llx\n", addr, (unsigned long long) size); return NULL; } if (ret != 1 || vq->iotlb_iov[0].iov_len != size) { vq_err(vq, "Non atomic userspace memory access: uaddr " "%p size 0x%llx\n", addr, (unsigned long long) size); return NULL; } return vq->iotlb_iov[0].iov_base; } /* This function should be called after iotlb * prefetch, which means we're sure that vq * could be access through iotlb. So -EAGAIN should * not happen in this case. */ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq, void __user *addr, unsigned int size, int type) { void __user *uaddr = vhost_vq_meta_fetch(vq, (u64)(uintptr_t)addr, size, type); if (uaddr) return uaddr; return __vhost_get_user_slow(vq, addr, size, type); } #define vhost_put_user(vq, x, ptr) \ ({ \ int ret; \ if (!vq->iotlb) { \ ret = __put_user(x, ptr); \ } else { \ __typeof__(ptr) to = \ (__typeof__(ptr)) __vhost_get_user(vq, ptr, \ sizeof(*ptr), VHOST_ADDR_USED); \ if (to != NULL) \ ret = __put_user(x, to); \ else \ ret = -EFAULT; \ } \ ret; \ }) static inline int vhost_put_avail_event(struct vhost_virtqueue *vq) { return vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx), vhost_avail_event(vq)); } static inline int vhost_put_used(struct vhost_virtqueue *vq, struct vring_used_elem *head, int idx, int count) { return vhost_copy_to_user(vq, vq->used->ring + idx, head, count * sizeof(*head)); } static inline int vhost_put_used_flags(struct vhost_virtqueue *vq) { return vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags), &vq->used->flags); } static inline int vhost_put_used_idx(struct vhost_virtqueue *vq) { return vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx), &vq->used->idx); } #define vhost_get_user(vq, x, ptr, type) \ ({ \ int ret; \ if (!vq->iotlb) { \ ret = __get_user(x, ptr); \ } else { \ __typeof__(ptr) from = \ (__typeof__(ptr)) __vhost_get_user(vq, ptr, \ sizeof(*ptr), \ type); \ if (from != NULL) \ ret = __get_user(x, from); \ else \ ret = -EFAULT; \ } \ ret; \ }) #define vhost_get_avail(vq, x, ptr) \ vhost_get_user(vq, x, ptr, VHOST_ADDR_AVAIL) #define vhost_get_used(vq, x, ptr) \ vhost_get_user(vq, x, ptr, VHOST_ADDR_USED) static void vhost_dev_lock_vqs(struct vhost_dev *d) { int i = 0; for (i = 0; i < d->nvqs; ++i) mutex_lock_nested(&d->vqs[i]->mutex, i); } static void vhost_dev_unlock_vqs(struct vhost_dev *d) { int i = 0; for (i = 0; i < d->nvqs; ++i) mutex_unlock(&d->vqs[i]->mutex); } static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq, __virtio16 *idx) { return vhost_get_avail(vq, *idx, &vq->avail->idx); } static inline int vhost_get_avail_head(struct vhost_virtqueue *vq, __virtio16 *head, int idx) { return vhost_get_avail(vq, *head, &vq->avail->ring[idx & (vq->num - 1)]); } static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq, __virtio16 *flags) { return vhost_get_avail(vq, *flags, &vq->avail->flags); } static inline int vhost_get_used_event(struct vhost_virtqueue *vq, __virtio16 *event) { return vhost_get_avail(vq, *event, vhost_used_event(vq)); } static inline int vhost_get_used_idx(struct vhost_virtqueue *vq, __virtio16 *idx) { return vhost_get_used(vq, *idx, &vq->used->idx); } static inline int vhost_get_desc(struct vhost_virtqueue *vq, struct vring_desc *desc, int idx) { return vhost_copy_from_user(vq, desc, vq->desc + idx, sizeof(*desc)); } static void vhost_iotlb_notify_vq(struct vhost_dev *d, struct vhost_iotlb_msg *msg) { struct vhost_msg_node *node, *n; spin_lock(&d->iotlb_lock); list_for_each_entry_safe(node, n, &d->pending_list, node) { struct vhost_iotlb_msg *vq_msg = &node->msg.iotlb; if (msg->iova <= vq_msg->iova && msg->iova + msg->size - 1 >= vq_msg->iova && vq_msg->type == VHOST_IOTLB_MISS) { vhost_poll_queue(&node->vq->poll); list_del(&node->node); kfree(node); } } spin_unlock(&d->iotlb_lock); } static bool umem_access_ok(u64 uaddr, u64 size, int access) { unsigned long a = uaddr; /* Make sure 64 bit math will not overflow. */ if (vhost_overflow(uaddr, size)) return false; if ((access & VHOST_ACCESS_RO) && !access_ok((void __user *)a, size)) return false; if ((access & VHOST_ACCESS_WO) && !access_ok((void __user *)a, size)) return false; return true; } static int vhost_process_iotlb_msg(struct vhost_dev *dev, u32 asid, struct vhost_iotlb_msg *msg) { int ret = 0; if (asid != 0) return -EINVAL; mutex_lock(&dev->mutex); vhost_dev_lock_vqs(dev); switch (msg->type) { case VHOST_IOTLB_UPDATE: if (!dev->iotlb) { ret = -EFAULT; break; } if (!umem_access_ok(msg->uaddr, msg->size, msg->perm)) { ret = -EFAULT; break; } vhost_vq_meta_reset(dev); if (vhost_iotlb_add_range(dev->iotlb, msg->iova, msg->iova + msg->size - 1, msg->uaddr, msg->perm)) { ret = -ENOMEM; break; } vhost_iotlb_notify_vq(dev, msg); break; case VHOST_IOTLB_INVALIDATE: if (!dev->iotlb) { ret = -EFAULT; break; } vhost_vq_meta_reset(dev); vhost_iotlb_del_range(dev->iotlb, msg->iova, msg->iova + msg->size - 1); break; default: ret = -EINVAL; break; } vhost_dev_unlock_vqs(dev); mutex_unlock(&dev->mutex); return ret; } ssize_t vhost_chr_write_iter(struct vhost_dev *dev, struct iov_iter *from) { struct vhost_iotlb_msg msg; size_t offset; int type, ret; u32 asid = 0; ret = copy_from_iter(&type, sizeof(type), from); if (ret != sizeof(type)) { ret = -EINVAL; goto done; } switch (type) { case VHOST_IOTLB_MSG: /* There maybe a hole after type for V1 message type, * so skip it here. */ offset = offsetof(struct vhost_msg, iotlb) - sizeof(int); break; case VHOST_IOTLB_MSG_V2: if (vhost_backend_has_feature(dev->vqs[0], VHOST_BACKEND_F_IOTLB_ASID)) { ret = copy_from_iter(&asid, sizeof(asid), from); if (ret != sizeof(asid)) { ret = -EINVAL; goto done; } offset = 0; } else offset = sizeof(__u32); break; default: ret = -EINVAL; goto done; } iov_iter_advance(from, offset); ret = copy_from_iter(&msg, sizeof(msg), from); if (ret != sizeof(msg)) { ret = -EINVAL; goto done; } if (msg.type == VHOST_IOTLB_UPDATE && msg.size == 0) { ret = -EINVAL; goto done; } if (dev->msg_handler) ret = dev->msg_handler(dev, asid, &msg); else ret = vhost_process_iotlb_msg(dev, asid, &msg); if (ret) { ret = -EFAULT; goto done; } ret = (type == VHOST_IOTLB_MSG) ? sizeof(struct vhost_msg) : sizeof(struct vhost_msg_v2); done: return ret; } EXPORT_SYMBOL(vhost_chr_write_iter); __poll_t vhost_chr_poll(struct file *file, struct vhost_dev *dev, poll_table *wait) { __poll_t mask = 0; poll_wait(file, &dev->wait, wait); if (!list_empty(&dev->read_list)) mask |= EPOLLIN | EPOLLRDNORM; return mask; } EXPORT_SYMBOL(vhost_chr_poll); ssize_t vhost_chr_read_iter(struct vhost_dev *dev, struct iov_iter *to, int noblock) { DEFINE_WAIT(wait); struct vhost_msg_node *node; ssize_t ret = 0; unsigned size = sizeof(struct vhost_msg); if (iov_iter_count(to) < size) return 0; while (1) { if (!noblock) prepare_to_wait(&dev->wait, &wait, TASK_INTERRUPTIBLE); node = vhost_dequeue_msg(dev, &dev->read_list); if (node) break; if (noblock) { ret = -EAGAIN; break; } if (signal_pending(current)) { ret = -ERESTARTSYS; break; } if (!dev->iotlb) { ret = -EBADFD; break; } schedule(); } if (!noblock) finish_wait(&dev->wait, &wait); if (node) { struct vhost_iotlb_msg *msg; void *start = &node->msg; switch (node->msg.type) { case VHOST_IOTLB_MSG: size = sizeof(node->msg); msg = &node->msg.iotlb; break; case VHOST_IOTLB_MSG_V2: size = sizeof(node->msg_v2); msg = &node->msg_v2.iotlb; break; default: BUG(); break; } ret = copy_to_iter(start, size, to); if (ret != size || msg->type != VHOST_IOTLB_MISS) { kfree(node); return ret; } vhost_enqueue_msg(dev, &dev->pending_list, node); } return ret; } EXPORT_SYMBOL_GPL(vhost_chr_read_iter); static int vhost_iotlb_miss(struct vhost_virtqueue *vq, u64 iova, int access) { struct vhost_dev *dev = vq->dev; struct vhost_msg_node *node; struct vhost_iotlb_msg *msg; bool v2 = vhost_backend_has_feature(vq, VHOST_BACKEND_F_IOTLB_MSG_V2); node = vhost_new_msg(vq, v2 ? VHOST_IOTLB_MSG_V2 : VHOST_IOTLB_MSG); if (!node) return -ENOMEM; if (v2) { node->msg_v2.type = VHOST_IOTLB_MSG_V2; msg = &node->msg_v2.iotlb; } else { msg = &node->msg.iotlb; } msg->type = VHOST_IOTLB_MISS; msg->iova = iova; msg->perm = access; vhost_enqueue_msg(dev, &dev->read_list, node); return 0; } static bool vq_access_ok(struct vhost_virtqueue *vq, unsigned int num, vring_desc_t __user *desc, vring_avail_t __user *avail, vring_used_t __user *used) { /* If an IOTLB device is present, the vring addresses are * GIOVAs. Access validation occurs at prefetch time. */ if (vq->iotlb) return true; return access_ok(desc, vhost_get_desc_size(vq, num)) && access_ok(avail, vhost_get_avail_size(vq, num)) && access_ok(used, vhost_get_used_size(vq, num)); } static void vhost_vq_meta_update(struct vhost_virtqueue *vq, const struct vhost_iotlb_map *map, int type) { int access = (type == VHOST_ADDR_USED) ? VHOST_ACCESS_WO : VHOST_ACCESS_RO; if (likely(map->perm & access)) vq->meta_iotlb[type] = map; } static bool iotlb_access_ok(struct vhost_virtqueue *vq, int access, u64 addr, u64 len, int type) { const struct vhost_iotlb_map *map; struct vhost_iotlb *umem = vq->iotlb; u64 s = 0, size, orig_addr = addr, last = addr + len - 1; if (vhost_vq_meta_fetch(vq, addr, len, type)) return true; while (len > s) { map = vhost_iotlb_itree_first(umem, addr, last); if (map == NULL || map->start > addr) { vhost_iotlb_miss(vq, addr, access); return false; } else if (!(map->perm & access)) { /* Report the possible access violation by * request another translation from userspace. */ return false; } size = map->size - addr + map->start; if (orig_addr == addr && size >= len) vhost_vq_meta_update(vq, map, type); s += size; addr += size; } return true; } int vq_meta_prefetch(struct vhost_virtqueue *vq) { unsigned int num = vq->num; if (!vq->iotlb) return 1; return iotlb_access_ok(vq, VHOST_MAP_RO, (u64)(uintptr_t)vq->desc, vhost_get_desc_size(vq, num), VHOST_ADDR_DESC) && iotlb_access_ok(vq, VHOST_MAP_RO, (u64)(uintptr_t)vq->avail, vhost_get_avail_size(vq, num), VHOST_ADDR_AVAIL) && iotlb_access_ok(vq, VHOST_MAP_WO, (u64)(uintptr_t)vq->used, vhost_get_used_size(vq, num), VHOST_ADDR_USED); } EXPORT_SYMBOL_GPL(vq_meta_prefetch); /* Can we log writes? */ /* Caller should have device mutex but not vq mutex */ bool vhost_log_access_ok(struct vhost_dev *dev) { return memory_access_ok(dev, dev->umem, 1); } EXPORT_SYMBOL_GPL(vhost_log_access_ok); static bool vq_log_used_access_ok(struct vhost_virtqueue *vq, void __user *log_base, bool log_used, u64 log_addr) { /* If an IOTLB device is present, log_addr is a GIOVA that * will never be logged by log_used(). */ if (vq->iotlb) return true; return !log_used || log_access_ok(log_base, log_addr, vhost_get_used_size(vq, vq->num)); } /* Verify access for write logging. */ /* Caller should have vq mutex and device mutex */ static bool vq_log_access_ok(struct vhost_virtqueue *vq, void __user *log_base) { return vq_memory_access_ok(log_base, vq->umem, vhost_has_feature(vq, VHOST_F_LOG_ALL)) && vq_log_used_access_ok(vq, log_base, vq->log_used, vq->log_addr); } /* Can we start vq? */ /* Caller should have vq mutex and device mutex */ bool vhost_vq_access_ok(struct vhost_virtqueue *vq) { if (!vq_log_access_ok(vq, vq->log_base)) return false; return vq_access_ok(vq, vq->num, vq->desc, vq->avail, vq->used); } EXPORT_SYMBOL_GPL(vhost_vq_access_ok); static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m) { struct vhost_memory mem, *newmem; struct vhost_memory_region *region; struct vhost_iotlb *newumem, *oldumem; unsigned long size = offsetof(struct vhost_memory, regions); int i; if (copy_from_user(&mem, m, size)) return -EFAULT; if (mem.padding) return -EOPNOTSUPP; if (mem.nregions > max_mem_regions) return -E2BIG; newmem = kvzalloc(struct_size(newmem, regions, mem.nregions), GFP_KERNEL); if (!newmem) return -ENOMEM; memcpy(newmem, &mem, size); if (copy_from_user(newmem->regions, m->regions, flex_array_size(newmem, regions, mem.nregions))) { kvfree(newmem); return -EFAULT; } newumem = iotlb_alloc(); if (!newumem) { kvfree(newmem); return -ENOMEM; } for (region = newmem->regions; region < newmem->regions + mem.nregions; region++) { if (vhost_iotlb_add_range(newumem, region->guest_phys_addr, region->guest_phys_addr + region->memory_size - 1, region->userspace_addr, VHOST_MAP_RW)) goto err; } if (!memory_access_ok(d, newumem, 0)) goto err; oldumem = d->umem; d->umem = newumem; /* All memory accesses are done under some VQ mutex. */ for (i = 0; i < d->nvqs; ++i) { mutex_lock(&d->vqs[i]->mutex); d->vqs[i]->umem = newumem; mutex_unlock(&d->vqs[i]->mutex); } kvfree(newmem); vhost_iotlb_free(oldumem); return 0; err: vhost_iotlb_free(newumem); kvfree(newmem); return -EFAULT; } static long vhost_vring_set_num(struct vhost_dev *d, struct vhost_virtqueue *vq, void __user *argp) { struct vhost_vring_state s; /* Resizing ring with an active backend? * You don't want to do that. */ if (vq->private_data) return -EBUSY; if (copy_from_user(&s, argp, sizeof s)) return -EFAULT; if (!s.num || s.num > 0xffff || (s.num & (s.num - 1))) return -EINVAL; vq->num = s.num; return 0; } static long vhost_vring_set_addr(struct vhost_dev *d, struct vhost_virtqueue *vq, void __user *argp) { struct vhost_vring_addr a; if (copy_from_user(&a, argp, sizeof a)) return -EFAULT; if (a.flags & ~(0x1 << VHOST_VRING_F_LOG)) return -EOPNOTSUPP; /* For 32bit, verify that the top 32bits of the user data are set to zero. */ if ((u64)(unsigned long)a.desc_user_addr != a.desc_user_addr || (u64)(unsigned long)a.used_user_addr != a.used_user_addr || (u64)(unsigned long)a.avail_user_addr != a.avail_user_addr) return -EFAULT; /* Make sure it's safe to cast pointers to vring types. */ BUILD_BUG_ON(__alignof__ *vq->avail > VRING_AVAIL_ALIGN_SIZE); BUILD_BUG_ON(__alignof__ *vq->used > VRING_USED_ALIGN_SIZE); if ((a.avail_user_addr & (VRING_AVAIL_ALIGN_SIZE - 1)) || (a.used_user_addr & (VRING_USED_ALIGN_SIZE - 1)) || (a.log_guest_addr & (VRING_USED_ALIGN_SIZE - 1))) return -EINVAL; /* We only verify access here if backend is configured. * If it is not, we don't as size might not have been setup. * We will verify when backend is configured. */ if (vq->private_data) { if (!vq_access_ok(vq, vq->num, (void __user *)(unsigned long)a.desc_user_addr, (void __user *)(unsigned long)a.avail_user_addr, (void __user *)(unsigned long)a.used_user_addr)) return -EINVAL; /* Also validate log access for used ring if enabled. */ if (!vq_log_used_access_ok(vq, vq->log_base, a.flags & (0x1 << VHOST_VRING_F_LOG), a.log_guest_addr)) return -EINVAL; } vq->log_used = !!(a.flags & (0x1 << VHOST_VRING_F_LOG)); vq->desc = (void __user *)(unsigned long)a.desc_user_addr; vq->avail = (void __user *)(unsigned long)a.avail_user_addr; vq->log_addr = a.log_guest_addr; vq->used = (void __user *)(unsigned long)a.used_user_addr; return 0; } static long vhost_vring_set_num_addr(struct vhost_dev *d, struct vhost_virtqueue *vq, unsigned int ioctl, void __user *argp) { long r; mutex_lock(&vq->mutex); switch (ioctl) { case VHOST_SET_VRING_NUM: r = vhost_vring_set_num(d, vq, argp); break; case VHOST_SET_VRING_ADDR: r = vhost_vring_set_addr(d, vq, argp); break; default: BUG(); } mutex_unlock(&vq->mutex); return r; } long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp) { struct file *eventfp, *filep = NULL; bool pollstart = false, pollstop = false; struct eventfd_ctx *ctx = NULL; struct vhost_virtqueue *vq; struct vhost_vring_state s; struct vhost_vring_file f; u32 idx; long r; r = vhost_get_vq_from_user(d, argp, &vq, &idx); if (r < 0) return r; if (ioctl == VHOST_SET_VRING_NUM || ioctl == VHOST_SET_VRING_ADDR) { return vhost_vring_set_num_addr(d, vq, ioctl, argp); } mutex_lock(&vq->mutex); switch (ioctl) { case VHOST_SET_VRING_BASE: /* Moving base with an active backend? * You don't want to do that. */ if (vq->private_data) { r = -EBUSY; break; } if (copy_from_user(&s, argp, sizeof s)) { r = -EFAULT; break; } if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED)) { vq->last_avail_idx = s.num & 0xffff; vq->last_used_idx = (s.num >> 16) & 0xffff; } else { if (s.num > 0xffff) { r = -EINVAL; break; } vq->last_avail_idx = s.num; } /* Forget the cached index value. */ vq->avail_idx = vq->last_avail_idx; break; case VHOST_GET_VRING_BASE: s.index = idx; if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED)) s.num = (u32)vq->last_avail_idx | ((u32)vq->last_used_idx << 16); else s.num = vq->last_avail_idx; if (copy_to_user(argp, &s, sizeof s)) r = -EFAULT; break; case VHOST_SET_VRING_KICK: if (copy_from_user(&f, argp, sizeof f)) { r = -EFAULT; break; } eventfp = f.fd == VHOST_FILE_UNBIND ? NULL : eventfd_fget(f.fd); if (IS_ERR(eventfp)) { r = PTR_ERR(eventfp); break; } if (eventfp != vq->kick) { pollstop = (filep = vq->kick) != NULL; pollstart = (vq->kick = eventfp) != NULL; } else filep = eventfp; break; case VHOST_SET_VRING_CALL: if (copy_from_user(&f, argp, sizeof f)) { r = -EFAULT; break; } ctx = f.fd == VHOST_FILE_UNBIND ? NULL : eventfd_ctx_fdget(f.fd); if (IS_ERR(ctx)) { r = PTR_ERR(ctx); break; } swap(ctx, vq->call_ctx.ctx); break; case VHOST_SET_VRING_ERR: if (copy_from_user(&f, argp, sizeof f)) { r = -EFAULT; break; } ctx = f.fd == VHOST_FILE_UNBIND ? NULL : eventfd_ctx_fdget(f.fd); if (IS_ERR(ctx)) { r = PTR_ERR(ctx); break; } swap(ctx, vq->error_ctx); break; case VHOST_SET_VRING_ENDIAN: r = vhost_set_vring_endian(vq, argp); break; case VHOST_GET_VRING_ENDIAN: r = vhost_get_vring_endian(vq, idx, argp); break; case VHOST_SET_VRING_BUSYLOOP_TIMEOUT: if (copy_from_user(&s, argp, sizeof(s))) { r = -EFAULT; break; } vq->busyloop_timeout = s.num; break; case VHOST_GET_VRING_BUSYLOOP_TIMEOUT: s.index = idx; s.num = vq->busyloop_timeout; if (copy_to_user(argp, &s, sizeof(s))) r = -EFAULT; break; default: r = -ENOIOCTLCMD; } if (pollstop && vq->handle_kick) vhost_poll_stop(&vq->poll); if (!IS_ERR_OR_NULL(ctx)) eventfd_ctx_put(ctx); if (filep) fput(filep); if (pollstart && vq->handle_kick) r = vhost_poll_start(&vq->poll, vq->kick); mutex_unlock(&vq->mutex); if (pollstop && vq->handle_kick) vhost_dev_flush(vq->poll.dev); return r; } EXPORT_SYMBOL_GPL(vhost_vring_ioctl); int vhost_init_device_iotlb(struct vhost_dev *d) { struct vhost_iotlb *niotlb, *oiotlb; int i; niotlb = iotlb_alloc(); if (!niotlb) return -ENOMEM; oiotlb = d->iotlb; d->iotlb = niotlb; for (i = 0; i < d->nvqs; ++i) { struct vhost_virtqueue *vq = d->vqs[i]; mutex_lock(&vq->mutex); vq->iotlb = niotlb; __vhost_vq_meta_reset(vq); mutex_unlock(&vq->mutex); } vhost_iotlb_free(oiotlb); return 0; } EXPORT_SYMBOL_GPL(vhost_init_device_iotlb); /* Caller must have device mutex */ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp) { struct eventfd_ctx *ctx; u64 p; long r; int i, fd; /* If you are not the owner, you can become one */ if (ioctl == VHOST_SET_OWNER) { r = vhost_dev_set_owner(d); goto done; } /* You must be the owner to do anything else */ r = vhost_dev_check_owner(d); if (r) goto done; switch (ioctl) { case VHOST_SET_MEM_TABLE: r = vhost_set_memory(d, argp); break; case VHOST_SET_LOG_BASE: if (copy_from_user(&p, argp, sizeof p)) { r = -EFAULT; break; } if ((u64)(unsigned long)p != p) { r = -EFAULT; break; } for (i = 0; i < d->nvqs; ++i) { struct vhost_virtqueue *vq; void __user *base = (void __user *)(unsigned long)p; vq = d->vqs[i]; mutex_lock(&vq->mutex); /* If ring is inactive, will check when it's enabled. */ if (vq->private_data && !vq_log_access_ok(vq, base)) r = -EFAULT; else vq->log_base = base; mutex_unlock(&vq->mutex); } break; case VHOST_SET_LOG_FD: r = get_user(fd, (int __user *)argp); if (r < 0) break; ctx = fd == VHOST_FILE_UNBIND ? NULL : eventfd_ctx_fdget(fd); if (IS_ERR(ctx)) { r = PTR_ERR(ctx); break; } swap(ctx, d->log_ctx); for (i = 0; i < d->nvqs; ++i) { mutex_lock(&d->vqs[i]->mutex); d->vqs[i]->log_ctx = d->log_ctx; mutex_unlock(&d->vqs[i]->mutex); } if (ctx) eventfd_ctx_put(ctx); break; default: r = -ENOIOCTLCMD; break; } done: return r; } EXPORT_SYMBOL_GPL(vhost_dev_ioctl); /* TODO: This is really inefficient. We need something like get_user() * (instruction directly accesses the data, with an exception table entry * returning -EFAULT). See Documentation/arch/x86/exception-tables.rst. */ static int set_bit_to_user(int nr, void __user *addr) { unsigned long log = (unsigned long)addr; struct page *page; void *base; int bit = nr + (log % PAGE_SIZE) * 8; int r; r = pin_user_pages_fast(log, 1, FOLL_WRITE, &page); if (r < 0) return r; BUG_ON(r != 1); base = kmap_atomic(page); set_bit(bit, base); kunmap_atomic(base); unpin_user_pages_dirty_lock(&page, 1, true); return 0; } static int log_write(void __user *log_base, u64 write_address, u64 write_length) { u64 write_page = write_address / VHOST_PAGE_SIZE; int r; if (!write_length) return 0; write_length += write_address % VHOST_PAGE_SIZE; for (;;) { u64 base = (u64)(unsigned long)log_base; u64 log = base + write_page / 8; int bit = write_page % 8; if ((u64)(unsigned long)log != log) return -EFAULT; r = set_bit_to_user(bit, (void __user *)(unsigned long)log); if (r < 0) return r; if (write_length <= VHOST_PAGE_SIZE) break; write_length -= VHOST_PAGE_SIZE; write_page += 1; } return r; } static int log_write_hva(struct vhost_virtqueue *vq, u64 hva, u64 len) { struct vhost_iotlb *umem = vq->umem; struct vhost_iotlb_map *u; u64 start, end, l, min; int r; bool hit = false; while (len) { min = len; /* More than one GPAs can be mapped into a single HVA. So * iterate all possible umems here to be safe. */ list_for_each_entry(u, &umem->list, link) { if (u->addr > hva - 1 + len || u->addr - 1 + u->size < hva) continue; start = max(u->addr, hva); end = min(u->addr - 1 + u->size, hva - 1 + len); l = end - start + 1; r = log_write(vq->log_base, u->start + start - u->addr, l); if (r < 0) return r; hit = true; min = min(l, min); } if (!hit) return -EFAULT; len -= min; hva += min; } return 0; } static int log_used(struct vhost_virtqueue *vq, u64 used_offset, u64 len) { struct iovec *iov = vq->log_iov; int i, ret; if (!vq->iotlb) return log_write(vq->log_base, vq->log_addr + used_offset, len); ret = translate_desc(vq, (uintptr_t)vq->used + used_offset, len, iov, 64, VHOST_ACCESS_WO); if (ret < 0) return ret; for (i = 0; i < ret; i++) { ret = log_write_hva(vq, (uintptr_t)iov[i].iov_base, iov[i].iov_len); if (ret) return ret; } return 0; } int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log, unsigned int log_num, u64 len, struct iovec *iov, int count) { int i, r; /* Make sure data written is seen before log. */ smp_wmb(); if (vq->iotlb) { for (i = 0; i < count; i++) { r = log_write_hva(vq, (uintptr_t)iov[i].iov_base, iov[i].iov_len); if (r < 0) return r; } return 0; } for (i = 0; i < log_num; ++i) { u64 l = min(log[i].len, len); r = log_write(vq->log_base, log[i].addr, l); if (r < 0) return r; len -= l; if (!len) { if (vq->log_ctx) eventfd_signal(vq->log_ctx); return 0; } } /* Length written exceeds what we have stored. This is a bug. */ BUG(); return 0; } EXPORT_SYMBOL_GPL(vhost_log_write); static int vhost_update_used_flags(struct vhost_virtqueue *vq) { void __user *used; if (vhost_put_used_flags(vq)) return -EFAULT; if (unlikely(vq->log_used)) { /* Make sure the flag is seen before log. */ smp_wmb(); /* Log used flag write. */ used = &vq->used->flags; log_used(vq, (used - (void __user *)vq->used), sizeof vq->used->flags); if (vq->log_ctx) eventfd_signal(vq->log_ctx); } return 0; } static int vhost_update_avail_event(struct vhost_virtqueue *vq) { if (vhost_put_avail_event(vq)) return -EFAULT; if (unlikely(vq->log_used)) { void __user *used; /* Make sure the event is seen before log. */ smp_wmb(); /* Log avail event write */ used = vhost_avail_event(vq); log_used(vq, (used - (void __user *)vq->used), sizeof *vhost_avail_event(vq)); if (vq->log_ctx) eventfd_signal(vq->log_ctx); } return 0; } int vhost_vq_init_access(struct vhost_virtqueue *vq) { __virtio16 last_used_idx; int r; bool is_le = vq->is_le; if (!vq->private_data) return 0; vhost_init_is_le(vq); r = vhost_update_used_flags(vq); if (r) goto err; vq->signalled_used_valid = false; if (!vq->iotlb && !access_ok(&vq->used->idx, sizeof vq->used->idx)) { r = -EFAULT; goto err; } r = vhost_get_used_idx(vq, &last_used_idx); if (r) { vq_err(vq, "Can't access used idx at %p\n", &vq->used->idx); goto err; } vq->last_used_idx = vhost16_to_cpu(vq, last_used_idx); return 0; err: vq->is_le = is_le; return r; } EXPORT_SYMBOL_GPL(vhost_vq_init_access); static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len, struct iovec iov[], int iov_size, int access) { const struct vhost_iotlb_map *map; struct vhost_dev *dev = vq->dev; struct vhost_iotlb *umem = dev->iotlb ? dev->iotlb : dev->umem; struct iovec *_iov; u64 s = 0, last = addr + len - 1; int ret = 0; while ((u64)len > s) { u64 size; if (unlikely(ret >= iov_size)) { ret = -ENOBUFS; break; } map = vhost_iotlb_itree_first(umem, addr, last); if (map == NULL || map->start > addr) { if (umem != dev->iotlb) { ret = -EFAULT; break; } ret = -EAGAIN; break; } else if (!(map->perm & access)) { ret = -EPERM; break; } _iov = iov + ret; size = map->size - addr + map->start; _iov->iov_len = min((u64)len - s, size); _iov->iov_base = (void __user *)(unsigned long) (map->addr + addr - map->start); s += size; addr += size; ++ret; } if (ret == -EAGAIN) vhost_iotlb_miss(vq, addr, access); return ret; } /* Each buffer in the virtqueues is actually a chain of descriptors. This * function returns the next descriptor in the chain, * or -1U if we're at the end. */ static unsigned next_desc(struct vhost_virtqueue *vq, struct vring_desc *desc) { unsigned int next; /* If this descriptor says it doesn't chain, we're done. */ if (!(desc->flags & cpu_to_vhost16(vq, VRING_DESC_F_NEXT))) return -1U; /* Check they're not leading us off end of descriptors. */ next = vhost16_to_cpu(vq, READ_ONCE(desc->next)); return next; } static int get_indirect(struct vhost_virtqueue *vq, struct iovec iov[], unsigned int iov_size, unsigned int *out_num, unsigned int *in_num, struct vhost_log *log, unsigned int *log_num, struct vring_desc *indirect) { struct vring_desc desc; unsigned int i = 0, count, found = 0; u32 len = vhost32_to_cpu(vq, indirect->len); struct iov_iter from; int ret, access; /* Sanity check */ if (unlikely(len % sizeof desc)) { vq_err(vq, "Invalid length in indirect descriptor: " "len 0x%llx not multiple of 0x%zx\n", (unsigned long long)len, sizeof desc); return -EINVAL; } ret = translate_desc(vq, vhost64_to_cpu(vq, indirect->addr), len, vq->indirect, UIO_MAXIOV, VHOST_ACCESS_RO); if (unlikely(ret < 0)) { if (ret != -EAGAIN) vq_err(vq, "Translation failure %d in indirect.\n", ret); return ret; } iov_iter_init(&from, ITER_SOURCE, vq->indirect, ret, len); count = len / sizeof desc; /* Buffers are chained via a 16 bit next field, so * we can have at most 2^16 of these. */ if (unlikely(count > USHRT_MAX + 1)) { vq_err(vq, "Indirect buffer length too big: %d\n", indirect->len); return -E2BIG; } do { unsigned iov_count = *in_num + *out_num; if (unlikely(++found > count)) { vq_err(vq, "Loop detected: last one at %u " "indirect size %u\n", i, count); return -EINVAL; } if (unlikely(!copy_from_iter_full(&desc, sizeof(desc), &from))) { vq_err(vq, "Failed indirect descriptor: idx %d, %zx\n", i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc); return -EINVAL; } if (unlikely(desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT))) { vq_err(vq, "Nested indirect descriptor: idx %d, %zx\n", i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc); return -EINVAL; } if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE)) access = VHOST_ACCESS_WO; else access = VHOST_ACCESS_RO; ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr), vhost32_to_cpu(vq, desc.len), iov + iov_count, iov_size - iov_count, access); if (unlikely(ret < 0)) { if (ret != -EAGAIN) vq_err(vq, "Translation failure %d indirect idx %d\n", ret, i); return ret; } /* If this is an input descriptor, increment that count. */ if (access == VHOST_ACCESS_WO) { *in_num += ret; if (unlikely(log && ret)) { log[*log_num].addr = vhost64_to_cpu(vq, desc.addr); log[*log_num].len = vhost32_to_cpu(vq, desc.len); ++*log_num; } } else { /* If it's an output descriptor, they're all supposed * to come before any input descriptors. */ if (unlikely(*in_num)) { vq_err(vq, "Indirect descriptor " "has out after in: idx %d\n", i); return -EINVAL; } *out_num += ret; } } while ((i = next_desc(vq, &desc)) != -1); return 0; } /* This looks in the virtqueue and for the first available buffer, and converts * it to an iovec for convenient access. Since descriptors consist of some * number of output then some number of input descriptors, it's actually two * iovecs, but we pack them into one and note how many of each there were. * * This function returns the descriptor number found, or vq->num (which is * never a valid descriptor number) if none was found. A negative code is * returned on error. */ int vhost_get_vq_desc(struct vhost_virtqueue *vq, struct iovec iov[], unsigned int iov_size, unsigned int *out_num, unsigned int *in_num, struct vhost_log *log, unsigned int *log_num) { struct vring_desc desc; unsigned int i, head, found = 0; u16 last_avail_idx; __virtio16 avail_idx; __virtio16 ring_head; int ret, access; /* Check it isn't doing very strange things with descriptor numbers. */ last_avail_idx = vq->last_avail_idx; if (vq->avail_idx == vq->last_avail_idx) { if (unlikely(vhost_get_avail_idx(vq, &avail_idx))) { vq_err(vq, "Failed to access avail idx at %p\n", &vq->avail->idx); return -EFAULT; } vq->avail_idx = vhost16_to_cpu(vq, avail_idx); if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) { vq_err(vq, "Guest moved avail index from %u to %u", last_avail_idx, vq->avail_idx); return -EFAULT; } /* If there's nothing new since last we looked, return * invalid. */ if (vq->avail_idx == last_avail_idx) return vq->num; /* Only get avail ring entries after they have been * exposed by guest. */ smp_rmb(); } /* Grab the next descriptor number they're advertising, and increment * the index we've seen. */ if (unlikely(vhost_get_avail_head(vq, &ring_head, last_avail_idx))) { vq_err(vq, "Failed to read head: idx %d address %p\n", last_avail_idx, &vq->avail->ring[last_avail_idx % vq->num]); return -EFAULT; } head = vhost16_to_cpu(vq, ring_head); /* If their number is silly, that's an error. */ if (unlikely(head >= vq->num)) { vq_err(vq, "Guest says index %u > %u is available", head, vq->num); return -EINVAL; } /* When we start there are none of either input nor output. */ *out_num = *in_num = 0; if (unlikely(log)) *log_num = 0; i = head; do { unsigned iov_count = *in_num + *out_num; if (unlikely(i >= vq->num)) { vq_err(vq, "Desc index is %u > %u, head = %u", i, vq->num, head); return -EINVAL; } if (unlikely(++found > vq->num)) { vq_err(vq, "Loop detected: last one at %u " "vq size %u head %u\n", i, vq->num, head); return -EINVAL; } ret = vhost_get_desc(vq, &desc, i); if (unlikely(ret)) { vq_err(vq, "Failed to get descriptor: idx %d addr %p\n", i, vq->desc + i); return -EFAULT; } if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT)) { ret = get_indirect(vq, iov, iov_size, out_num, in_num, log, log_num, &desc); if (unlikely(ret < 0)) { if (ret != -EAGAIN) vq_err(vq, "Failure detected " "in indirect descriptor at idx %d\n", i); return ret; } continue; } if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE)) access = VHOST_ACCESS_WO; else access = VHOST_ACCESS_RO; ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr), vhost32_to_cpu(vq, desc.len), iov + iov_count, iov_size - iov_count, access); if (unlikely(ret < 0)) { if (ret != -EAGAIN) vq_err(vq, "Translation failure %d descriptor idx %d\n", ret, i); return ret; } if (access == VHOST_ACCESS_WO) { /* If this is an input descriptor, * increment that count. */ *in_num += ret; if (unlikely(log && ret)) { log[*log_num].addr = vhost64_to_cpu(vq, desc.addr); log[*log_num].len = vhost32_to_cpu(vq, desc.len); ++*log_num; } } else { /* If it's an output descriptor, they're all supposed * to come before any input descriptors. */ if (unlikely(*in_num)) { vq_err(vq, "Descriptor has out after in: " "idx %d\n", i); return -EINVAL; } *out_num += ret; } } while ((i = next_desc(vq, &desc)) != -1); /* On success, increment avail index. */ vq->last_avail_idx++; /* Assume notifications from guest are disabled at this point, * if they aren't we would need to update avail_event index. */ BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY)); return head; } EXPORT_SYMBOL_GPL(vhost_get_vq_desc); /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */ void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n) { vq->last_avail_idx -= n; } EXPORT_SYMBOL_GPL(vhost_discard_vq_desc); /* After we've used one of their buffers, we tell them about it. We'll then * want to notify the guest, using eventfd. */ int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len) { struct vring_used_elem heads = { cpu_to_vhost32(vq, head), cpu_to_vhost32(vq, len) }; return vhost_add_used_n(vq, &heads, 1); } EXPORT_SYMBOL_GPL(vhost_add_used); static int __vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads, unsigned count) { vring_used_elem_t __user *used; u16 old, new; int start; start = vq->last_used_idx & (vq->num - 1); used = vq->used->ring + start; if (vhost_put_used(vq, heads, start, count)) { vq_err(vq, "Failed to write used"); return -EFAULT; } if (unlikely(vq->log_used)) { /* Make sure data is seen before log. */ smp_wmb(); /* Log used ring entry write. */ log_used(vq, ((void __user *)used - (void __user *)vq->used), count * sizeof *used); } old = vq->last_used_idx; new = (vq->last_used_idx += count); /* If the driver never bothers to signal in a very long while, * used index might wrap around. If that happens, invalidate * signalled_used index we stored. TODO: make sure driver * signals at least once in 2^16 and remove this. */ if (unlikely((u16)(new - vq->signalled_used) < (u16)(new - old))) vq->signalled_used_valid = false; return 0; } /* After we've used one of their buffers, we tell them about it. We'll then * want to notify the guest, using eventfd. */ int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads, unsigned count) { int start, n, r; start = vq->last_used_idx & (vq->num - 1); n = vq->num - start; if (n < count) { r = __vhost_add_used_n(vq, heads, n); if (r < 0) return r; heads += n; count -= n; } r = __vhost_add_used_n(vq, heads, count); /* Make sure buffer is written before we update index. */ smp_wmb(); if (vhost_put_used_idx(vq)) { vq_err(vq, "Failed to increment used idx"); return -EFAULT; } if (unlikely(vq->log_used)) { /* Make sure used idx is seen before log. */ smp_wmb(); /* Log used index update. */ log_used(vq, offsetof(struct vring_used, idx), sizeof vq->used->idx); if (vq->log_ctx) eventfd_signal(vq->log_ctx); } return r; } EXPORT_SYMBOL_GPL(vhost_add_used_n); static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) { __u16 old, new; __virtio16 event; bool v; /* Flush out used index updates. This is paired * with the barrier that the Guest executes when enabling * interrupts. */ smp_mb(); if (vhost_has_feature(vq, VIRTIO_F_NOTIFY_ON_EMPTY) && unlikely(vq->avail_idx == vq->last_avail_idx)) return true; if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) { __virtio16 flags; if (vhost_get_avail_flags(vq, &flags)) { vq_err(vq, "Failed to get flags"); return true; } return !(flags & cpu_to_vhost16(vq, VRING_AVAIL_F_NO_INTERRUPT)); } old = vq->signalled_used; v = vq->signalled_used_valid; new = vq->signalled_used = vq->last_used_idx; vq->signalled_used_valid = true; if (unlikely(!v)) return true; if (vhost_get_used_event(vq, &event)) { vq_err(vq, "Failed to get used event idx"); return true; } return vring_need_event(vhost16_to_cpu(vq, event), new, old); } /* This actually signals the guest, using eventfd. */ void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq) { /* Signal the Guest tell them we used something up. */ if (vq->call_ctx.ctx && vhost_notify(dev, vq)) eventfd_signal(vq->call_ctx.ctx); } EXPORT_SYMBOL_GPL(vhost_signal); /* And here's the combo meal deal. Supersize me! */ void vhost_add_used_and_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq, unsigned int head, int len) { vhost_add_used(vq, head, len); vhost_signal(dev, vq); } EXPORT_SYMBOL_GPL(vhost_add_used_and_signal); /* multi-buffer version of vhost_add_used_and_signal */ void vhost_add_used_and_signal_n(struct vhost_dev *dev, struct vhost_virtqueue *vq, struct vring_used_elem *heads, unsigned count) { vhost_add_used_n(vq, heads, count); vhost_signal(dev, vq); } EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n); /* return true if we're sure that avaiable ring is empty */ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq) { __virtio16 avail_idx; int r; if (vq->avail_idx != vq->last_avail_idx) return false; r = vhost_get_avail_idx(vq, &avail_idx); if (unlikely(r)) return false; vq->avail_idx = vhost16_to_cpu(vq, avail_idx); if (vq->avail_idx != vq->last_avail_idx) { /* Since we have updated avail_idx, the following * call to vhost_get_vq_desc() will read available * ring entries. Make sure that read happens after * the avail_idx read. */ smp_rmb(); return false; } return true; } EXPORT_SYMBOL_GPL(vhost_vq_avail_empty); /* OK, now we need to know about added descriptors. */ bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) { __virtio16 avail_idx; int r; if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY)) return false; vq->used_flags &= ~VRING_USED_F_NO_NOTIFY; if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) { r = vhost_update_used_flags(vq); if (r) { vq_err(vq, "Failed to enable notification at %p: %d\n", &vq->used->flags, r); return false; } } else { r = vhost_update_avail_event(vq); if (r) { vq_err(vq, "Failed to update avail event index at %p: %d\n", vhost_avail_event(vq), r); return false; } } /* They could have slipped one in as we were doing that: make * sure it's written, then check again. */ smp_mb(); r = vhost_get_avail_idx(vq, &avail_idx); if (r) { vq_err(vq, "Failed to check avail idx at %p: %d\n", &vq->avail->idx, r); return false; } vq->avail_idx = vhost16_to_cpu(vq, avail_idx); if (vq->avail_idx != vq->last_avail_idx) { /* Since we have updated avail_idx, the following * call to vhost_get_vq_desc() will read available * ring entries. Make sure that read happens after * the avail_idx read. */ smp_rmb(); return true; } return false; } EXPORT_SYMBOL_GPL(vhost_enable_notify); /* We don't need to be notified again. */ void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) { int r; if (vq->used_flags & VRING_USED_F_NO_NOTIFY) return; vq->used_flags |= VRING_USED_F_NO_NOTIFY; if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) { r = vhost_update_used_flags(vq); if (r) vq_err(vq, "Failed to disable notification at %p: %d\n", &vq->used->flags, r); } } EXPORT_SYMBOL_GPL(vhost_disable_notify); /* Create a new message. */ struct vhost_msg_node *vhost_new_msg(struct vhost_virtqueue *vq, int type) { /* Make sure all padding within the structure is initialized. */ struct vhost_msg_node *node = kzalloc(sizeof(*node), GFP_KERNEL); if (!node) return NULL; node->vq = vq; node->msg.type = type; return node; } EXPORT_SYMBOL_GPL(vhost_new_msg); void vhost_enqueue_msg(struct vhost_dev *dev, struct list_head *head, struct vhost_msg_node *node) { spin_lock(&dev->iotlb_lock); list_add_tail(&node->node, head); spin_unlock(&dev->iotlb_lock); wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM); } EXPORT_SYMBOL_GPL(vhost_enqueue_msg); struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev, struct list_head *head) { struct vhost_msg_node *node = NULL; spin_lock(&dev->iotlb_lock); if (!list_empty(head)) { node = list_first_entry(head, struct vhost_msg_node, node); list_del(&node->node); } spin_unlock(&dev->iotlb_lock); return node; } EXPORT_SYMBOL_GPL(vhost_dequeue_msg); void vhost_set_backend_features(struct vhost_dev *dev, u64 features) { struct vhost_virtqueue *vq; int i; mutex_lock(&dev->mutex); for (i = 0; i < dev->nvqs; ++i) { vq = dev->vqs[i]; mutex_lock(&vq->mutex); vq->acked_backend_features = features; mutex_unlock(&vq->mutex); } mutex_unlock(&dev->mutex); } EXPORT_SYMBOL_GPL(vhost_set_backend_features); static int __init vhost_init(void) { return 0; } static void __exit vhost_exit(void) { } module_init(vhost_init); module_exit(vhost_exit); MODULE_VERSION("0.0.1"); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Michael S. Tsirkin"); MODULE_DESCRIPTION("Host kernel accelerator for virtio");
415 237 147 134 33 39 125 126 116 111 165 165 1 4 4 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Red Black Trees (C) 1999 Andrea Arcangeli <andrea@suse.de> linux/include/linux/rbtree.h To use rbtrees you'll have to implement your own insert and search cores. This will avoid us to use callbacks and to drop drammatically performances. I know it's not the cleaner way, but in C (not in C++) to get performances and genericity... See Documentation/core-api/rbtree.rst for documentation and samples. */ #ifndef _LINUX_RBTREE_H #define _LINUX_RBTREE_H #include <linux/container_of.h> #include <linux/rbtree_types.h> #include <linux/stddef.h> #include <linux/rcupdate.h> #define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3)) #define rb_entry(ptr, type, member) container_of(ptr, type, member) #define RB_EMPTY_ROOT(root) (READ_ONCE((root)->rb_node) == NULL) /* 'empty' nodes are nodes that are known not to be inserted in an rbtree */ #define RB_EMPTY_NODE(node) \ ((node)->__rb_parent_color == (unsigned long)(node)) #define RB_CLEAR_NODE(node) \ ((node)->__rb_parent_color = (unsigned long)(node)) extern void rb_insert_color(struct rb_node *, struct rb_root *); extern void rb_erase(struct rb_node *, struct rb_root *); /* Find logical next and previous nodes in a tree */ extern struct rb_node *rb_next(const struct rb_node *); extern struct rb_node *rb_prev(const struct rb_node *); extern struct rb_node *rb_first(const struct rb_root *); extern struct rb_node *rb_last(const struct rb_root *); /* Postorder iteration - always visit the parent after its children */ extern struct rb_node *rb_first_postorder(const struct rb_root *); extern struct rb_node *rb_next_postorder(const struct rb_node *); /* Fast replacement of a single node without remove/rebalance/add/rebalance */ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root); extern void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new, struct rb_root *root); static inline void rb_link_node(struct rb_node *node, struct rb_node *parent, struct rb_node **rb_link) { node->__rb_parent_color = (unsigned long)parent; node->rb_left = node->rb_right = NULL; *rb_link = node; } static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent, struct rb_node **rb_link) { node->__rb_parent_color = (unsigned long)parent; node->rb_left = node->rb_right = NULL; rcu_assign_pointer(*rb_link, node); } #define rb_entry_safe(ptr, type, member) \ ({ typeof(ptr) ____ptr = (ptr); \ ____ptr ? rb_entry(____ptr, type, member) : NULL; \ }) /** * rbtree_postorder_for_each_entry_safe - iterate in post-order over rb_root of * given type allowing the backing memory of @pos to be invalidated * * @pos: the 'type *' to use as a loop cursor. * @n: another 'type *' to use as temporary storage * @root: 'rb_root *' of the rbtree. * @field: the name of the rb_node field within 'type'. * * rbtree_postorder_for_each_entry_safe() provides a similar guarantee as * list_for_each_entry_safe() and allows the iteration to continue independent * of changes to @pos by the body of the loop. * * Note, however, that it cannot handle other modifications that re-order the * rbtree it is iterating over. This includes calling rb_erase() on @pos, as * rb_erase() may rebalance the tree, causing us to miss some nodes. */ #define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \ for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \ pos && ({ n = rb_entry_safe(rb_next_postorder(&pos->field), \ typeof(*pos), field); 1; }); \ pos = n) /* Same as rb_first(), but O(1) */ #define rb_first_cached(root) (root)->rb_leftmost static inline void rb_insert_color_cached(struct rb_node *node, struct rb_root_cached *root, bool leftmost) { if (leftmost) root->rb_leftmost = node; rb_insert_color(node, &root->rb_root); } static inline struct rb_node * rb_erase_cached(struct rb_node *node, struct rb_root_cached *root) { struct rb_node *leftmost = NULL; if (root->rb_leftmost == node) leftmost = root->rb_leftmost = rb_next(node); rb_erase(node, &root->rb_root); return leftmost; } static inline void rb_replace_node_cached(struct rb_node *victim, struct rb_node *new, struct rb_root_cached *root) { if (root->rb_leftmost == victim) root->rb_leftmost = new; rb_replace_node(victim, new, &root->rb_root); } /* * The below helper functions use 2 operators with 3 different * calling conventions. The operators are related like: * * comp(a->key,b) < 0 := less(a,b) * comp(a->key,b) > 0 := less(b,a) * comp(a->key,b) == 0 := !less(a,b) && !less(b,a) * * If these operators define a partial order on the elements we make no * guarantee on which of the elements matching the key is found. See * rb_find(). * * The reason for this is to allow the find() interface without requiring an * on-stack dummy object, which might not be feasible due to object size. */ /** * rb_add_cached() - insert @node into the leftmost cached tree @tree * @node: node to insert * @tree: leftmost cached tree to insert @node into * @less: operator defining the (partial) node order * * Returns @node when it is the new leftmost, or NULL. */ static __always_inline struct rb_node * rb_add_cached(struct rb_node *node, struct rb_root_cached *tree, bool (*less)(struct rb_node *, const struct rb_node *)) { struct rb_node **link = &tree->rb_root.rb_node; struct rb_node *parent = NULL; bool leftmost = true; while (*link) { parent = *link; if (less(node, parent)) { link = &parent->rb_left; } else { link = &parent->rb_right; leftmost = false; } } rb_link_node(node, parent, link); rb_insert_color_cached(node, tree, leftmost); return leftmost ? node : NULL; } /** * rb_add() - insert @node into @tree * @node: node to insert * @tree: tree to insert @node into * @less: operator defining the (partial) node order */ static __always_inline void rb_add(struct rb_node *node, struct rb_root *tree, bool (*less)(struct rb_node *, const struct rb_node *)) { struct rb_node **link = &tree->rb_node; struct rb_node *parent = NULL; while (*link) { parent = *link; if (less(node, parent)) link = &parent->rb_left; else link = &parent->rb_right; } rb_link_node(node, parent, link); rb_insert_color(node, tree); } /** * rb_find_add() - find equivalent @node in @tree, or add @node * @node: node to look-for / insert * @tree: tree to search / modify * @cmp: operator defining the node order * * Returns the rb_node matching @node, or NULL when no match is found and @node * is inserted. */ static __always_inline struct rb_node * rb_find_add(struct rb_node *node, struct rb_root *tree, int (*cmp)(struct rb_node *, const struct rb_node *)) { struct rb_node **link = &tree->rb_node; struct rb_node *parent = NULL; int c; while (*link) { parent = *link; c = cmp(node, parent); if (c < 0) link = &parent->rb_left; else if (c > 0) link = &parent->rb_right; else return parent; } rb_link_node(node, parent, link); rb_insert_color(node, tree); return NULL; } /** * rb_find() - find @key in tree @tree * @key: key to match * @tree: tree to search * @cmp: operator defining the node order * * Returns the rb_node matching @key or NULL. */ static __always_inline struct rb_node * rb_find(const void *key, const struct rb_root *tree, int (*cmp)(const void *key, const struct rb_node *)) { struct rb_node *node = tree->rb_node; while (node) { int c = cmp(key, node); if (c < 0) node = node->rb_left; else if (c > 0) node = node->rb_right; else return node; } return NULL; } /** * rb_find_first() - find the first @key in @tree * @key: key to match * @tree: tree to search * @cmp: operator defining node order * * Returns the leftmost node matching @key, or NULL. */ static __always_inline struct rb_node * rb_find_first(const void *key, const struct rb_root *tree, int (*cmp)(const void *key, const struct rb_node *)) { struct rb_node *node = tree->rb_node; struct rb_node *match = NULL; while (node) { int c = cmp(key, node); if (c <= 0) { if (!c) match = node; node = node->rb_left; } else if (c > 0) { node = node->rb_right; } } return match; } /** * rb_next_match() - find the next @key in @tree * @key: key to match * @tree: tree to search * @cmp: operator defining node order * * Returns the next node matching @key, or NULL. */ static __always_inline struct rb_node * rb_next_match(const void *key, struct rb_node *node, int (*cmp)(const void *key, const struct rb_node *)) { node = rb_next(node); if (node && cmp(key, node)) node = NULL; return node; } /** * rb_for_each() - iterates a subtree matching @key * @node: iterator * @key: key to match * @tree: tree to search * @cmp: operator defining node order */ #define rb_for_each(node, key, tree, cmp) \ for ((node) = rb_find_first((key), (tree), (cmp)); \ (node); (node) = rb_next_match((key), (node), (cmp))) #endif /* _LINUX_RBTREE_H */
15 15 15 14 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_VIRTIO_RING_H #define _LINUX_VIRTIO_RING_H #include <asm/barrier.h> #include <linux/irqreturn.h> #include <uapi/linux/virtio_ring.h> /* * Barriers in virtio are tricky. Non-SMP virtio guests can't assume * they're not on an SMP host system, so they need to assume real * barriers. Non-SMP virtio hosts could skip the barriers, but does * anyone care? * * For virtio_pci on SMP, we don't need to order with respect to MMIO * accesses through relaxed memory I/O windows, so virt_mb() et al are * sufficient. * * For using virtio to talk to real devices (eg. other heterogeneous * CPUs) we do need real barriers. In theory, we could be using both * kinds of virtio, so it's a runtime decision, and the branch is * actually quite cheap. */ static inline void virtio_mb(bool weak_barriers) { if (weak_barriers) virt_mb(); else mb(); } static inline void virtio_rmb(bool weak_barriers) { if (weak_barriers) virt_rmb(); else dma_rmb(); } static inline void virtio_wmb(bool weak_barriers) { if (weak_barriers) virt_wmb(); else dma_wmb(); } #define virtio_store_mb(weak_barriers, p, v) \ do { \ if (weak_barriers) { \ virt_store_mb(*p, v); \ } else { \ WRITE_ONCE(*p, v); \ mb(); \ } \ } while (0) \ struct virtio_device; struct virtqueue; struct device; /* * Creates a virtqueue and allocates the descriptor ring. If * may_reduce_num is set, then this may allocate a smaller ring than * expected. The caller should query virtqueue_get_vring_size to learn * the actual size of the ring. */ struct virtqueue *vring_create_virtqueue(unsigned int index, unsigned int num, unsigned int vring_align, struct virtio_device *vdev, bool weak_barriers, bool may_reduce_num, bool ctx, bool (*notify)(struct virtqueue *vq), void (*callback)(struct virtqueue *vq), const char *name); /* * Creates a virtqueue and allocates the descriptor ring with per * virtqueue DMA device. */ struct virtqueue *vring_create_virtqueue_dma(unsigned int index, unsigned int num, unsigned int vring_align, struct virtio_device *vdev, bool weak_barriers, bool may_reduce_num, bool ctx, bool (*notify)(struct virtqueue *vq), void (*callback)(struct virtqueue *vq), const char *name, struct device *dma_dev); /* * Creates a virtqueue with a standard layout but a caller-allocated * ring. */ struct virtqueue *vring_new_virtqueue(unsigned int index, unsigned int num, unsigned int vring_align, struct virtio_device *vdev, bool weak_barriers, bool ctx, void *pages, bool (*notify)(struct virtqueue *vq), void (*callback)(struct virtqueue *vq), const char *name); /* * Destroys a virtqueue. If created with vring_create_virtqueue, this * also frees the ring. */ void vring_del_virtqueue(struct virtqueue *vq); /* Filter out transport-specific feature bits. */ void vring_transport_features(struct virtio_device *vdev); irqreturn_t vring_interrupt(int irq, void *_vq); u32 vring_notification_data(struct virtqueue *_vq); #endif /* _LINUX_VIRTIO_RING_H */
3 11 1 7 1 1 9 8 8 9 1 1 1 9 1 8 9 1 9 2 2 1 2 1 9 9 9 2 2 10 10 10 1 1 10 9 1 1 1 1 1 1 1 1 1 2 9 1 8 5 9 1 4 4 4 4 4 4 4 3 2 4 4 4 4 4 4 4 4 4 9 9 9 9 9 9 9 9 9 10 10 10 7 10 10 10 9 9 9 9 2 7 9 5 9 9 2 2 9 9 9 9 9 9 9 9 9 9 7 2 9 8 5 2 3 5 10 2 1 2 1 2 2 2 1 1 1 1 2 2 2 2 2 1 1 1 1 2 2 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 9 2 1 1 1 1 1 1 3 2 1 3 14 2 14 8 8 1 2 1 1 1 1 9 2 3 2 2 2 2 1 2 4 4 4 4 4 4 4 4 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 1993 by Theodore Ts'o. */ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/sched.h> #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/file.h> #include <linux/stat.h> #include <linux/errno.h> #include <linux/major.h> #include <linux/wait.h> #include <linux/blkpg.h> #include <linux/init.h> #include <linux/swap.h> #include <linux/slab.h> #include <linux/compat.h> #include <linux/suspend.h> #include <linux/freezer.h> #include <linux/mutex.h> #include <linux/writeback.h> #include <linux/completion.h> #include <linux/highmem.h> #include <linux/splice.h> #include <linux/sysfs.h> #include <linux/miscdevice.h> #include <linux/falloc.h> #include <linux/uio.h> #include <linux/ioprio.h> #include <linux/blk-cgroup.h> #include <linux/sched/mm.h> #include <linux/statfs.h> #include <linux/uaccess.h> #include <linux/blk-mq.h> #include <linux/spinlock.h> #include <uapi/linux/loop.h> /* Possible states of device */ enum { Lo_unbound, Lo_bound, Lo_rundown, Lo_deleting, }; struct loop_func_table; struct loop_device { int lo_number; loff_t lo_offset; loff_t lo_sizelimit; int lo_flags; char lo_file_name[LO_NAME_SIZE]; struct file * lo_backing_file; struct block_device *lo_device; gfp_t old_gfp_mask; spinlock_t lo_lock; int lo_state; spinlock_t lo_work_lock; struct workqueue_struct *workqueue; struct work_struct rootcg_work; struct list_head rootcg_cmd_list; struct list_head idle_worker_list; struct rb_root worker_tree; struct timer_list timer; bool use_dio; bool sysfs_inited; struct request_queue *lo_queue; struct blk_mq_tag_set tag_set; struct gendisk *lo_disk; struct mutex lo_mutex; bool idr_visible; }; struct loop_cmd { struct list_head list_entry; bool use_aio; /* use AIO interface to handle I/O */ atomic_t ref; /* only for aio */ long ret; struct kiocb iocb; struct bio_vec *bvec; struct cgroup_subsys_state *blkcg_css; struct cgroup_subsys_state *memcg_css; }; #define LOOP_IDLE_WORKER_TIMEOUT (60 * HZ) #define LOOP_DEFAULT_HW_Q_DEPTH 128 static DEFINE_IDR(loop_index_idr); static DEFINE_MUTEX(loop_ctl_mutex); static DEFINE_MUTEX(loop_validate_mutex); /** * loop_global_lock_killable() - take locks for safe loop_validate_file() test * * @lo: struct loop_device * @global: true if @lo is about to bind another "struct loop_device", false otherwise * * Returns 0 on success, -EINTR otherwise. * * Since loop_validate_file() traverses on other "struct loop_device" if * is_loop_device() is true, we need a global lock for serializing concurrent * loop_configure()/loop_change_fd()/__loop_clr_fd() calls. */ static int loop_global_lock_killable(struct loop_device *lo, bool global) { int err; if (global) { err = mutex_lock_killable(&loop_validate_mutex); if (err) return err; } err = mutex_lock_killable(&lo->lo_mutex); if (err && global) mutex_unlock(&loop_validate_mutex); return err; } /** * loop_global_unlock() - release locks taken by loop_global_lock_killable() * * @lo: struct loop_device * @global: true if @lo was about to bind another "struct loop_device", false otherwise */ static void loop_global_unlock(struct loop_device *lo, bool global) { mutex_unlock(&lo->lo_mutex); if (global) mutex_unlock(&loop_validate_mutex); } static int max_part; static int part_shift; static loff_t get_size(loff_t offset, loff_t sizelimit, struct file *file) { loff_t loopsize; /* Compute loopsize in bytes */ loopsize = i_size_read(file->f_mapping->host); if (offset > 0) loopsize -= offset; /* offset is beyond i_size, weird but possible */ if (loopsize < 0) return 0; if (sizelimit > 0 && sizelimit < loopsize) loopsize = sizelimit; /* * Unfortunately, if we want to do I/O on the device, * the number of 512-byte sectors has to fit into a sector_t. */ return loopsize >> 9; } static loff_t get_loop_size(struct loop_device *lo, struct file *file) { return get_size(lo->lo_offset, lo->lo_sizelimit, file); } /* * We support direct I/O only if lo_offset is aligned with the logical I/O size * of backing device, and the logical block size of loop is bigger than that of * the backing device. */ static bool lo_bdev_can_use_dio(struct loop_device *lo, struct block_device *backing_bdev) { unsigned short sb_bsize = bdev_logical_block_size(backing_bdev); if (queue_logical_block_size(lo->lo_queue) < sb_bsize) return false; if (lo->lo_offset & (sb_bsize - 1)) return false; return true; } static void __loop_update_dio(struct loop_device *lo, bool dio) { struct file *file = lo->lo_backing_file; struct inode *inode = file->f_mapping->host; struct block_device *backing_bdev = NULL; bool use_dio; if (S_ISBLK(inode->i_mode)) backing_bdev = I_BDEV(inode); else if (inode->i_sb->s_bdev) backing_bdev = inode->i_sb->s_bdev; use_dio = dio && (file->f_mode & FMODE_CAN_ODIRECT) && (!backing_bdev || lo_bdev_can_use_dio(lo, backing_bdev)); if (lo->use_dio == use_dio) return; /* flush dirty pages before changing direct IO */ vfs_fsync(file, 0); /* * The flag of LO_FLAGS_DIRECT_IO is handled similarly with * LO_FLAGS_READ_ONLY, both are set from kernel, and losetup * will get updated by ioctl(LOOP_GET_STATUS) */ if (lo->lo_state == Lo_bound) blk_mq_freeze_queue(lo->lo_queue); lo->use_dio = use_dio; if (use_dio) { blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, lo->lo_queue); lo->lo_flags |= LO_FLAGS_DIRECT_IO; } else { blk_queue_flag_set(QUEUE_FLAG_NOMERGES, lo->lo_queue); lo->lo_flags &= ~LO_FLAGS_DIRECT_IO; } if (lo->lo_state == Lo_bound) blk_mq_unfreeze_queue(lo->lo_queue); } /** * loop_set_size() - sets device size and notifies userspace * @lo: struct loop_device to set the size for * @size: new size of the loop device * * Callers must validate that the size passed into this function fits into * a sector_t, eg using loop_validate_size() */ static void loop_set_size(struct loop_device *lo, loff_t size) { if (!set_capacity_and_notify(lo->lo_disk, size)) kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE); } static int lo_write_bvec(struct file *file, struct bio_vec *bvec, loff_t *ppos) { struct iov_iter i; ssize_t bw; iov_iter_bvec(&i, ITER_SOURCE, bvec, 1, bvec->bv_len); bw = vfs_iter_write(file, &i, ppos, 0); if (likely(bw == bvec->bv_len)) return 0; printk_ratelimited(KERN_ERR "loop: Write error at byte offset %llu, length %i.\n", (unsigned long long)*ppos, bvec->bv_len); if (bw >= 0) bw = -EIO; return bw; } static int lo_write_simple(struct loop_device *lo, struct request *rq, loff_t pos) { struct bio_vec bvec; struct req_iterator iter; int ret = 0; rq_for_each_segment(bvec, rq, iter) { ret = lo_write_bvec(lo->lo_backing_file, &bvec, &pos); if (ret < 0) break; cond_resched(); } return ret; } static int lo_read_simple(struct loop_device *lo, struct request *rq, loff_t pos) { struct bio_vec bvec; struct req_iterator iter; struct iov_iter i; ssize_t len; rq_for_each_segment(bvec, rq, iter) { iov_iter_bvec(&i, ITER_DEST, &bvec, 1, bvec.bv_len); len = vfs_iter_read(lo->lo_backing_file, &i, &pos, 0); if (len < 0) return len; flush_dcache_page(bvec.bv_page); if (len != bvec.bv_len) { struct bio *bio; __rq_for_each_bio(bio, rq) zero_fill_bio(bio); break; } cond_resched(); } return 0; } static void loop_clear_limits(struct loop_device *lo, int mode) { struct queue_limits lim = queue_limits_start_update(lo->lo_queue); if (mode & FALLOC_FL_ZERO_RANGE) lim.max_write_zeroes_sectors = 0; if (mode & FALLOC_FL_PUNCH_HOLE) { lim.max_hw_discard_sectors = 0; lim.discard_granularity = 0; } queue_limits_commit_update(lo->lo_queue, &lim); } static int lo_fallocate(struct loop_device *lo, struct request *rq, loff_t pos, int mode) { /* * We use fallocate to manipulate the space mappings used by the image * a.k.a. discard/zerorange. */ struct file *file = lo->lo_backing_file; int ret; mode |= FALLOC_FL_KEEP_SIZE; if (!bdev_max_discard_sectors(lo->lo_device)) return -EOPNOTSUPP; ret = file->f_op->fallocate(file, mode, pos, blk_rq_bytes(rq)); if (unlikely(ret && ret != -EINVAL && ret != -EOPNOTSUPP)) return -EIO; /* * We initially configure the limits in a hope that fallocate is * supported and clear them here if that turns out not to be true. */ if (unlikely(ret == -EOPNOTSUPP)) loop_clear_limits(lo, mode); return ret; } static int lo_req_flush(struct loop_device *lo, struct request *rq) { int ret = vfs_fsync(lo->lo_backing_file, 0); if (unlikely(ret && ret != -EINVAL)) ret = -EIO; return ret; } static void lo_complete_rq(struct request *rq) { struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); blk_status_t ret = BLK_STS_OK; if (!cmd->use_aio || cmd->ret < 0 || cmd->ret == blk_rq_bytes(rq) || req_op(rq) != REQ_OP_READ) { if (cmd->ret < 0) ret = errno_to_blk_status(cmd->ret); goto end_io; } /* * Short READ - if we got some data, advance our request and * retry it. If we got no data, end the rest with EIO. */ if (cmd->ret) { blk_update_request(rq, BLK_STS_OK, cmd->ret); cmd->ret = 0; blk_mq_requeue_request(rq, true); } else { if (cmd->use_aio) { struct bio *bio = rq->bio; while (bio) { zero_fill_bio(bio); bio = bio->bi_next; } } ret = BLK_STS_IOERR; end_io: blk_mq_end_request(rq, ret); } } static void lo_rw_aio_do_completion(struct loop_cmd *cmd) { struct request *rq = blk_mq_rq_from_pdu(cmd); if (!atomic_dec_and_test(&cmd->ref)) return; kfree(cmd->bvec); cmd->bvec = NULL; if (likely(!blk_should_fake_timeout(rq->q))) blk_mq_complete_request(rq); } static void lo_rw_aio_complete(struct kiocb *iocb, long ret) { struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb); cmd->ret = ret; lo_rw_aio_do_completion(cmd); } static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, loff_t pos, int rw) { struct iov_iter iter; struct req_iterator rq_iter; struct bio_vec *bvec; struct request *rq = blk_mq_rq_from_pdu(cmd); struct bio *bio = rq->bio; struct file *file = lo->lo_backing_file; struct bio_vec tmp; unsigned int offset; int nr_bvec = 0; int ret; rq_for_each_bvec(tmp, rq, rq_iter) nr_bvec++; if (rq->bio != rq->biotail) { bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec), GFP_NOIO); if (!bvec) return -EIO; cmd->bvec = bvec; /* * The bios of the request may be started from the middle of * the 'bvec' because of bio splitting, so we can't directly * copy bio->bi_iov_vec to new bvec. The rq_for_each_bvec * API will take care of all details for us. */ rq_for_each_bvec(tmp, rq, rq_iter) { *bvec = tmp; bvec++; } bvec = cmd->bvec; offset = 0; } else { /* * Same here, this bio may be started from the middle of the * 'bvec' because of bio splitting, so offset from the bvec * must be passed to iov iterator */ offset = bio->bi_iter.bi_bvec_done; bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); } atomic_set(&cmd->ref, 2); iov_iter_bvec(&iter, rw, bvec, nr_bvec, blk_rq_bytes(rq)); iter.iov_offset = offset; cmd->iocb.ki_pos = pos; cmd->iocb.ki_filp = file; cmd->iocb.ki_complete = lo_rw_aio_complete; cmd->iocb.ki_flags = IOCB_DIRECT; cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0); if (rw == ITER_SOURCE) ret = file->f_op->write_iter(&cmd->iocb, &iter); else ret = file->f_op->read_iter(&cmd->iocb, &iter); lo_rw_aio_do_completion(cmd); if (ret != -EIOCBQUEUED) lo_rw_aio_complete(&cmd->iocb, ret); return 0; } static int do_req_filebacked(struct loop_device *lo, struct request *rq) { struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); loff_t pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset; /* * lo_write_simple and lo_read_simple should have been covered * by io submit style function like lo_rw_aio(), one blocker * is that lo_read_simple() need to call flush_dcache_page after * the page is written from kernel, and it isn't easy to handle * this in io submit style function which submits all segments * of the req at one time. And direct read IO doesn't need to * run flush_dcache_page(). */ switch (req_op(rq)) { case REQ_OP_FLUSH: return lo_req_flush(lo, rq); case REQ_OP_WRITE_ZEROES: /* * If the caller doesn't want deallocation, call zeroout to * write zeroes the range. Otherwise, punch them out. */ return lo_fallocate(lo, rq, pos, (rq->cmd_flags & REQ_NOUNMAP) ? FALLOC_FL_ZERO_RANGE : FALLOC_FL_PUNCH_HOLE); case REQ_OP_DISCARD: return lo_fallocate(lo, rq, pos, FALLOC_FL_PUNCH_HOLE); case REQ_OP_WRITE: if (cmd->use_aio) return lo_rw_aio(lo, cmd, pos, ITER_SOURCE); else return lo_write_simple(lo, rq, pos); case REQ_OP_READ: if (cmd->use_aio) return lo_rw_aio(lo, cmd, pos, ITER_DEST); else return lo_read_simple(lo, rq, pos); default: WARN_ON_ONCE(1); return -EIO; } } static inline void loop_update_dio(struct loop_device *lo) { __loop_update_dio(lo, (lo->lo_backing_file->f_flags & O_DIRECT) | lo->use_dio); } static void loop_reread_partitions(struct loop_device *lo) { int rc; mutex_lock(&lo->lo_disk->open_mutex); rc = bdev_disk_changed(lo->lo_disk, false); mutex_unlock(&lo->lo_disk->open_mutex); if (rc) pr_warn("%s: partition scan of loop%d (%s) failed (rc=%d)\n", __func__, lo->lo_number, lo->lo_file_name, rc); } static inline int is_loop_device(struct file *file) { struct inode *i = file->f_mapping->host; return i && S_ISBLK(i->i_mode) && imajor(i) == LOOP_MAJOR; } static int loop_validate_file(struct file *file, struct block_device *bdev) { struct inode *inode = file->f_mapping->host; struct file *f = file; /* Avoid recursion */ while (is_loop_device(f)) { struct loop_device *l; lockdep_assert_held(&loop_validate_mutex); if (f->f_mapping->host->i_rdev == bdev->bd_dev) return -EBADF; l = I_BDEV(f->f_mapping->host)->bd_disk->private_data; if (l->lo_state != Lo_bound) return -EINVAL; /* Order wrt setting lo->lo_backing_file in loop_configure(). */ rmb(); f = l->lo_backing_file; } if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode)) return -EINVAL; return 0; } /* * loop_change_fd switched the backing store of a loopback device to * a new file. This is useful for operating system installers to free up * the original file and in High Availability environments to switch to * an alternative location for the content in case of server meltdown. * This can only work if the loop device is used read-only, and if the * new backing store is the same size and type as the old backing store. */ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, unsigned int arg) { struct file *file = fget(arg); struct file *old_file; int error; bool partscan; bool is_loop; if (!file) return -EBADF; /* suppress uevents while reconfiguring the device */ dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 1); is_loop = is_loop_device(file); error = loop_global_lock_killable(lo, is_loop); if (error) goto out_putf; error = -ENXIO; if (lo->lo_state != Lo_bound) goto out_err; /* the loop device has to be read-only */ error = -EINVAL; if (!(lo->lo_flags & LO_FLAGS_READ_ONLY)) goto out_err; error = loop_validate_file(file, bdev); if (error) goto out_err; old_file = lo->lo_backing_file; error = -EINVAL; /* size of the new backing store needs to be the same */ if (get_loop_size(lo, file) != get_loop_size(lo, old_file)) goto out_err; /* and ... switch */ disk_force_media_change(lo->lo_disk); blk_mq_freeze_queue(lo->lo_queue); mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask); lo->lo_backing_file = file; lo->old_gfp_mask = mapping_gfp_mask(file->f_mapping); mapping_set_gfp_mask(file->f_mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); loop_update_dio(lo); blk_mq_unfreeze_queue(lo->lo_queue); partscan = lo->lo_flags & LO_FLAGS_PARTSCAN; loop_global_unlock(lo, is_loop); /* * Flush loop_validate_file() before fput(), for l->lo_backing_file * might be pointing at old_file which might be the last reference. */ if (!is_loop) { mutex_lock(&loop_validate_mutex); mutex_unlock(&loop_validate_mutex); } /* * We must drop file reference outside of lo_mutex as dropping * the file ref can take open_mutex which creates circular locking * dependency. */ fput(old_file); if (partscan) loop_reread_partitions(lo); error = 0; done: /* enable and uncork uevent now that we are done */ dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0); return error; out_err: loop_global_unlock(lo, is_loop); out_putf: fput(file); goto done; } /* loop sysfs attributes */ static ssize_t loop_attr_show(struct device *dev, char *page, ssize_t (*callback)(struct loop_device *, char *)) { struct gendisk *disk = dev_to_disk(dev); struct loop_device *lo = disk->private_data; return callback(lo, page); } #define LOOP_ATTR_RO(_name) \ static ssize_t loop_attr_##_name##_show(struct loop_device *, char *); \ static ssize_t loop_attr_do_show_##_name(struct device *d, \ struct device_attribute *attr, char *b) \ { \ return loop_attr_show(d, b, loop_attr_##_name##_show); \ } \ static struct device_attribute loop_attr_##_name = \ __ATTR(_name, 0444, loop_attr_do_show_##_name, NULL); static ssize_t loop_attr_backing_file_show(struct loop_device *lo, char *buf) { ssize_t ret; char *p = NULL; spin_lock_irq(&lo->lo_lock); if (lo->lo_backing_file) p = file_path(lo->lo_backing_file, buf, PAGE_SIZE - 1); spin_unlock_irq(&lo->lo_lock); if (IS_ERR_OR_NULL(p)) ret = PTR_ERR(p); else { ret = strlen(p); memmove(buf, p, ret); buf[ret++] = '\n'; buf[ret] = 0; } return ret; } static ssize_t loop_attr_offset_show(struct loop_device *lo, char *buf) { return sysfs_emit(buf, "%llu\n", (unsigned long long)lo->lo_offset); } static ssize_t loop_attr_sizelimit_show(struct loop_device *lo, char *buf) { return sysfs_emit(buf, "%llu\n", (unsigned long long)lo->lo_sizelimit); } static ssize_t loop_attr_autoclear_show(struct loop_device *lo, char *buf) { int autoclear = (lo->lo_flags & LO_FLAGS_AUTOCLEAR); return sysfs_emit(buf, "%s\n", autoclear ? "1" : "0"); } static ssize_t loop_attr_partscan_show(struct loop_device *lo, char *buf) { int partscan = (lo->lo_flags & LO_FLAGS_PARTSCAN); return sysfs_emit(buf, "%s\n", partscan ? "1" : "0"); } static ssize_t loop_attr_dio_show(struct loop_device *lo, char *buf) { int dio = (lo->lo_flags & LO_FLAGS_DIRECT_IO); return sysfs_emit(buf, "%s\n", dio ? "1" : "0"); } LOOP_ATTR_RO(backing_file); LOOP_ATTR_RO(offset); LOOP_ATTR_RO(sizelimit); LOOP_ATTR_RO(autoclear); LOOP_ATTR_RO(partscan); LOOP_ATTR_RO(dio); static struct attribute *loop_attrs[] = { &loop_attr_backing_file.attr, &loop_attr_offset.attr, &loop_attr_sizelimit.attr, &loop_attr_autoclear.attr, &loop_attr_partscan.attr, &loop_attr_dio.attr, NULL, }; static struct attribute_group loop_attribute_group = { .name = "loop", .attrs= loop_attrs, }; static void loop_sysfs_init(struct loop_device *lo) { lo->sysfs_inited = !sysfs_create_group(&disk_to_dev(lo->lo_disk)->kobj, &loop_attribute_group); } static void loop_sysfs_exit(struct loop_device *lo) { if (lo->sysfs_inited) sysfs_remove_group(&disk_to_dev(lo->lo_disk)->kobj, &loop_attribute_group); } static void loop_config_discard(struct loop_device *lo, struct queue_limits *lim) { struct file *file = lo->lo_backing_file; struct inode *inode = file->f_mapping->host; u32 granularity = 0, max_discard_sectors = 0; struct kstatfs sbuf; /* * If the backing device is a block device, mirror its zeroing * capability. Set the discard sectors to the block device's zeroing * capabilities because loop discards result in blkdev_issue_zeroout(), * not blkdev_issue_discard(). This maintains consistent behavior with * file-backed loop devices: discarded regions read back as zero. */ if (S_ISBLK(inode->i_mode)) { struct request_queue *backingq = bdev_get_queue(I_BDEV(inode)); max_discard_sectors = backingq->limits.max_write_zeroes_sectors; granularity = bdev_discard_granularity(I_BDEV(inode)) ?: queue_physical_block_size(backingq); /* * We use punch hole to reclaim the free space used by the * image a.k.a. discard. */ } else if (file->f_op->fallocate && !vfs_statfs(&file->f_path, &sbuf)) { max_discard_sectors = UINT_MAX >> 9; granularity = sbuf.f_bsize; } lim->max_hw_discard_sectors = max_discard_sectors; lim->max_write_zeroes_sectors = max_discard_sectors; if (max_discard_sectors) lim->discard_granularity = granularity; else lim->discard_granularity = 0; } struct loop_worker { struct rb_node rb_node; struct work_struct work; struct list_head cmd_list; struct list_head idle_list; struct loop_device *lo; struct cgroup_subsys_state *blkcg_css; unsigned long last_ran_at; }; static void loop_workfn(struct work_struct *work); #ifdef CONFIG_BLK_CGROUP static inline int queue_on_root_worker(struct cgroup_subsys_state *css) { return !css || css == blkcg_root_css; } #else static inline int queue_on_root_worker(struct cgroup_subsys_state *css) { return !css; } #endif static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd) { struct rb_node **node, *parent = NULL; struct loop_worker *cur_worker, *worker = NULL; struct work_struct *work; struct list_head *cmd_list; spin_lock_irq(&lo->lo_work_lock); if (queue_on_root_worker(cmd->blkcg_css)) goto queue_work; node = &lo->worker_tree.rb_node; while (*node) { parent = *node; cur_worker = container_of(*node, struct loop_worker, rb_node); if (cur_worker->blkcg_css == cmd->blkcg_css) { worker = cur_worker; break; } else if ((long)cur_worker->blkcg_css < (long)cmd->blkcg_css) { node = &(*node)->rb_left; } else { node = &(*node)->rb_right; } } if (worker) goto queue_work; worker = kzalloc(sizeof(struct loop_worker), GFP_NOWAIT | __GFP_NOWARN); /* * In the event we cannot allocate a worker, just queue on the * rootcg worker and issue the I/O as the rootcg */ if (!worker) { cmd->blkcg_css = NULL; if (cmd->memcg_css) css_put(cmd->memcg_css); cmd->memcg_css = NULL; goto queue_work; } worker->blkcg_css = cmd->blkcg_css; css_get(worker->blkcg_css); INIT_WORK(&worker->work, loop_workfn); INIT_LIST_HEAD(&worker->cmd_list); INIT_LIST_HEAD(&worker->idle_list); worker->lo = lo; rb_link_node(&worker->rb_node, parent, node); rb_insert_color(&worker->rb_node, &lo->worker_tree); queue_work: if (worker) { /* * We need to remove from the idle list here while * holding the lock so that the idle timer doesn't * free the worker */ if (!list_empty(&worker->idle_list)) list_del_init(&worker->idle_list); work = &worker->work; cmd_list = &worker->cmd_list; } else { work = &lo->rootcg_work; cmd_list = &lo->rootcg_cmd_list; } list_add_tail(&cmd->list_entry, cmd_list); queue_work(lo->workqueue, work); spin_unlock_irq(&lo->lo_work_lock); } static void loop_set_timer(struct loop_device *lo) { timer_reduce(&lo->timer, jiffies + LOOP_IDLE_WORKER_TIMEOUT); } static void loop_free_idle_workers(struct loop_device *lo, bool delete_all) { struct loop_worker *pos, *worker; spin_lock_irq(&lo->lo_work_lock); list_for_each_entry_safe(worker, pos, &lo->idle_worker_list, idle_list) { if (!delete_all && time_is_after_jiffies(worker->last_ran_at + LOOP_IDLE_WORKER_TIMEOUT)) break; list_del(&worker->idle_list); rb_erase(&worker->rb_node, &lo->worker_tree); css_put(worker->blkcg_css); kfree(worker); } if (!list_empty(&lo->idle_worker_list)) loop_set_timer(lo); spin_unlock_irq(&lo->lo_work_lock); } static void loop_free_idle_workers_timer(struct timer_list *timer) { struct loop_device *lo = container_of(timer, struct loop_device, timer); return loop_free_idle_workers(lo, false); } static void loop_update_rotational(struct loop_device *lo) { struct file *file = lo->lo_backing_file; struct inode *file_inode = file->f_mapping->host; struct block_device *file_bdev = file_inode->i_sb->s_bdev; struct request_queue *q = lo->lo_queue; bool nonrot = true; /* not all filesystems (e.g. tmpfs) have a sb->s_bdev */ if (file_bdev) nonrot = bdev_nonrot(file_bdev); if (nonrot) blk_queue_flag_set(QUEUE_FLAG_NONROT, q); else blk_queue_flag_clear(QUEUE_FLAG_NONROT, q); } /** * loop_set_status_from_info - configure device from loop_info * @lo: struct loop_device to configure * @info: struct loop_info64 to configure the device with * * Configures the loop device parameters according to the passed * in loop_info64 configuration. */ static int loop_set_status_from_info(struct loop_device *lo, const struct loop_info64 *info) { if ((unsigned int) info->lo_encrypt_key_size > LO_KEY_SIZE) return -EINVAL; switch (info->lo_encrypt_type) { case LO_CRYPT_NONE: break; case LO_CRYPT_XOR: pr_warn("support for the xor transformation has been removed.\n"); return -EINVAL; case LO_CRYPT_CRYPTOAPI: pr_warn("support for cryptoloop has been removed. Use dm-crypt instead.\n"); return -EINVAL; default: return -EINVAL; } /* Avoid assigning overflow values */ if (info->lo_offset > LLONG_MAX || info->lo_sizelimit > LLONG_MAX) return -EOVERFLOW; lo->lo_offset = info->lo_offset; lo->lo_sizelimit = info->lo_sizelimit; memcpy(lo->lo_file_name, info->lo_file_name, LO_NAME_SIZE); lo->lo_file_name[LO_NAME_SIZE-1] = 0; lo->lo_flags = info->lo_flags; return 0; } static int loop_reconfigure_limits(struct loop_device *lo, unsigned short bsize, bool update_discard_settings) { struct queue_limits lim; lim = queue_limits_start_update(lo->lo_queue); lim.logical_block_size = bsize; lim.physical_block_size = bsize; lim.io_min = bsize; if (update_discard_settings) loop_config_discard(lo, &lim); return queue_limits_commit_update(lo->lo_queue, &lim); } static int loop_configure(struct loop_device *lo, blk_mode_t mode, struct block_device *bdev, const struct loop_config *config) { struct file *file = fget(config->fd); struct inode *inode; struct address_space *mapping; int error; loff_t size; bool partscan; unsigned short bsize; bool is_loop; if (!file) return -EBADF; is_loop = is_loop_device(file); /* This is safe, since we have a reference from open(). */ __module_get(THIS_MODULE); /* * If we don't hold exclusive handle for the device, upgrade to it * here to avoid changing device under exclusive owner. */ if (!(mode & BLK_OPEN_EXCL)) { error = bd_prepare_to_claim(bdev, loop_configure, NULL); if (error) goto out_putf; } error = loop_global_lock_killable(lo, is_loop); if (error) goto out_bdev; error = -EBUSY; if (lo->lo_state != Lo_unbound) goto out_unlock; error = loop_validate_file(file, bdev); if (error) goto out_unlock; mapping = file->f_mapping; inode = mapping->host; if ((config->info.lo_flags & ~LOOP_CONFIGURE_SETTABLE_FLAGS) != 0) { error = -EINVAL; goto out_unlock; } if (config->block_size) { error = blk_validate_block_size(config->block_size); if (error) goto out_unlock; } error = loop_set_status_from_info(lo, &config->info); if (error) goto out_unlock; if (!(file->f_mode & FMODE_WRITE) || !(mode & BLK_OPEN_WRITE) || !file->f_op->write_iter) lo->lo_flags |= LO_FLAGS_READ_ONLY; if (!lo->workqueue) { lo->workqueue = alloc_workqueue("loop%d", WQ_UNBOUND | WQ_FREEZABLE, 0, lo->lo_number); if (!lo->workqueue) { error = -ENOMEM; goto out_unlock; } } /* suppress uevents while reconfiguring the device */ dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 1); disk_force_media_change(lo->lo_disk); set_disk_ro(lo->lo_disk, (lo->lo_flags & LO_FLAGS_READ_ONLY) != 0); lo->use_dio = lo->lo_flags & LO_FLAGS_DIRECT_IO; lo->lo_device = bdev; lo->lo_backing_file = file; lo->old_gfp_mask = mapping_gfp_mask(mapping); mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); if (!(lo->lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync) blk_queue_write_cache(lo->lo_queue, true, false); if (config->block_size) bsize = config->block_size; else if ((lo->lo_backing_file->f_flags & O_DIRECT) && inode->i_sb->s_bdev) /* In case of direct I/O, match underlying block size */ bsize = bdev_logical_block_size(inode->i_sb->s_bdev); else bsize = 512; error = loop_reconfigure_limits(lo, bsize, true); if (WARN_ON_ONCE(error)) goto out_unlock; loop_update_rotational(lo); loop_update_dio(lo); loop_sysfs_init(lo); size = get_loop_size(lo, file); loop_set_size(lo, size); /* Order wrt reading lo_state in loop_validate_file(). */ wmb(); lo->lo_state = Lo_bound; if (part_shift) lo->lo_flags |= LO_FLAGS_PARTSCAN; partscan = lo->lo_flags & LO_FLAGS_PARTSCAN; if (partscan) clear_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state); /* enable and uncork uevent now that we are done */ dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0); loop_global_unlock(lo, is_loop); if (partscan) loop_reread_partitions(lo); if (!(mode & BLK_OPEN_EXCL)) bd_abort_claiming(bdev, loop_configure); return 0; out_unlock: loop_global_unlock(lo, is_loop); out_bdev: if (!(mode & BLK_OPEN_EXCL)) bd_abort_claiming(bdev, loop_configure); out_putf: fput(file); /* This is safe: open() is still holding a reference. */ module_put(THIS_MODULE); return error; } static void __loop_clr_fd(struct loop_device *lo, bool release) { struct file *filp; gfp_t gfp = lo->old_gfp_mask; if (test_bit(QUEUE_FLAG_WC, &lo->lo_queue->queue_flags)) blk_queue_write_cache(lo->lo_queue, false, false); /* * Freeze the request queue when unbinding on a live file descriptor and * thus an open device. When called from ->release we are guaranteed * that there is no I/O in progress already. */ if (!release) blk_mq_freeze_queue(lo->lo_queue); spin_lock_irq(&lo->lo_lock); filp = lo->lo_backing_file; lo->lo_backing_file = NULL; spin_unlock_irq(&lo->lo_lock); lo->lo_device = NULL; lo->lo_offset = 0; lo->lo_sizelimit = 0; memset(lo->lo_file_name, 0, LO_NAME_SIZE); loop_reconfigure_limits(lo, 512, false); invalidate_disk(lo->lo_disk); loop_sysfs_exit(lo); /* let user-space know about this change */ kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE); mapping_set_gfp_mask(filp->f_mapping, gfp); /* This is safe: open() is still holding a reference. */ module_put(THIS_MODULE); if (!release) blk_mq_unfreeze_queue(lo->lo_queue); disk_force_media_change(lo->lo_disk); if (lo->lo_flags & LO_FLAGS_PARTSCAN) { int err; /* * open_mutex has been held already in release path, so don't * acquire it if this function is called in such case. * * If the reread partition isn't from release path, lo_refcnt * must be at least one and it can only become zero when the * current holder is released. */ if (!release) mutex_lock(&lo->lo_disk->open_mutex); err = bdev_disk_changed(lo->lo_disk, false); if (!release) mutex_unlock(&lo->lo_disk->open_mutex); if (err) pr_warn("%s: partition scan of loop%d failed (rc=%d)\n", __func__, lo->lo_number, err); /* Device is gone, no point in returning error */ } /* * lo->lo_state is set to Lo_unbound here after above partscan has * finished. There cannot be anybody else entering __loop_clr_fd() as * Lo_rundown state protects us from all the other places trying to * change the 'lo' device. */ lo->lo_flags = 0; if (!part_shift) set_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state); mutex_lock(&lo->lo_mutex); lo->lo_state = Lo_unbound; mutex_unlock(&lo->lo_mutex); /* * Need not hold lo_mutex to fput backing file. Calling fput holding * lo_mutex triggers a circular lock dependency possibility warning as * fput can take open_mutex which is usually taken before lo_mutex. */ fput(filp); } static int loop_clr_fd(struct loop_device *lo) { int err; /* * Since lo_ioctl() is called without locks held, it is possible that * loop_configure()/loop_change_fd() and loop_clr_fd() run in parallel. * * Therefore, use global lock when setting Lo_rundown state in order to * make sure that loop_validate_file() will fail if the "struct file" * which loop_configure()/loop_change_fd() found via fget() was this * loop device. */ err = loop_global_lock_killable(lo, true); if (err) return err; if (lo->lo_state != Lo_bound) { loop_global_unlock(lo, true); return -ENXIO; } /* * If we've explicitly asked to tear down the loop device, * and it has an elevated reference count, set it for auto-teardown when * the last reference goes away. This stops $!~#$@ udev from * preventing teardown because it decided that it needs to run blkid on * the loopback device whenever they appear. xfstests is notorious for * failing tests because blkid via udev races with a losetup * <dev>/do something like mkfs/losetup -d <dev> causing the losetup -d * command to fail with EBUSY. */ if (disk_openers(lo->lo_disk) > 1) { lo->lo_flags |= LO_FLAGS_AUTOCLEAR; loop_global_unlock(lo, true); return 0; } lo->lo_state = Lo_rundown; loop_global_unlock(lo, true); __loop_clr_fd(lo, false); return 0; } static int loop_set_status(struct loop_device *lo, const struct loop_info64 *info) { int err; int prev_lo_flags; bool partscan = false; bool size_changed = false; err = mutex_lock_killable(&lo->lo_mutex); if (err) return err; if (lo->lo_state != Lo_bound) { err = -ENXIO; goto out_unlock; } if (lo->lo_offset != info->lo_offset || lo->lo_sizelimit != info->lo_sizelimit) { size_changed = true; sync_blockdev(lo->lo_device); invalidate_bdev(lo->lo_device); } /* I/O need to be drained during transfer transition */ blk_mq_freeze_queue(lo->lo_queue); prev_lo_flags = lo->lo_flags; err = loop_set_status_from_info(lo, info); if (err) goto out_unfreeze; /* Mask out flags that can't be set using LOOP_SET_STATUS. */ lo->lo_flags &= LOOP_SET_STATUS_SETTABLE_FLAGS; /* For those flags, use the previous values instead */ lo->lo_flags |= prev_lo_flags & ~LOOP_SET_STATUS_SETTABLE_FLAGS; /* For flags that can't be cleared, use previous values too */ lo->lo_flags |= prev_lo_flags & ~LOOP_SET_STATUS_CLEARABLE_FLAGS; if (size_changed) { loff_t new_size = get_size(lo->lo_offset, lo->lo_sizelimit, lo->lo_backing_file); loop_set_size(lo, new_size); } /* update dio if lo_offset or transfer is changed */ __loop_update_dio(lo, lo->use_dio); out_unfreeze: blk_mq_unfreeze_queue(lo->lo_queue); if (!err && (lo->lo_flags & LO_FLAGS_PARTSCAN) && !(prev_lo_flags & LO_FLAGS_PARTSCAN)) { clear_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state); partscan = true; } out_unlock: mutex_unlock(&lo->lo_mutex); if (partscan) loop_reread_partitions(lo); return err; } static int loop_get_status(struct loop_device *lo, struct loop_info64 *info) { struct path path; struct kstat stat; int ret; ret = mutex_lock_killable(&lo->lo_mutex); if (ret) return ret; if (lo->lo_state != Lo_bound) { mutex_unlock(&lo->lo_mutex); return -ENXIO; } memset(info, 0, sizeof(*info)); info->lo_number = lo->lo_number; info->lo_offset = lo->lo_offset; info->lo_sizelimit = lo->lo_sizelimit; info->lo_flags = lo->lo_flags; memcpy(info->lo_file_name, lo->lo_file_name, LO_NAME_SIZE); /* Drop lo_mutex while we call into the filesystem. */ path = lo->lo_backing_file->f_path; path_get(&path); mutex_unlock(&lo->lo_mutex); ret = vfs_getattr(&path, &stat, STATX_INO, AT_STATX_SYNC_AS_STAT); if (!ret) { info->lo_device = huge_encode_dev(stat.dev); info->lo_inode = stat.ino; info->lo_rdevice = huge_encode_dev(stat.rdev); } path_put(&path); return ret; } static void loop_info64_from_old(const struct loop_info *info, struct loop_info64 *info64) { memset(info64, 0, sizeof(*info64)); info64->lo_number = info->lo_number; info64->lo_device = info->lo_device; info64->lo_inode = info->lo_inode; info64->lo_rdevice = info->lo_rdevice; info64->lo_offset = info->lo_offset; info64->lo_sizelimit = 0; info64->lo_flags = info->lo_flags; memcpy(info64->lo_file_name, info->lo_name, LO_NAME_SIZE); } static int loop_info64_to_old(const struct loop_info64 *info64, struct loop_info *info) { memset(info, 0, sizeof(*info)); info->lo_number = info64->lo_number; info->lo_device = info64->lo_device; info->lo_inode = info64->lo_inode; info->lo_rdevice = info64->lo_rdevice; info->lo_offset = info64->lo_offset; info->lo_flags = info64->lo_flags; memcpy(info->lo_name, info64->lo_file_name, LO_NAME_SIZE); /* error in case values were truncated */ if (info->lo_device != info64->lo_device || info->lo_rdevice != info64->lo_rdevice || info->lo_inode != info64->lo_inode || info->lo_offset != info64->lo_offset) return -EOVERFLOW; return 0; } static int loop_set_status_old(struct loop_device *lo, const struct loop_info __user *arg) { struct loop_info info; struct loop_info64 info64; if (copy_from_user(&info, arg, sizeof (struct loop_info))) return -EFAULT; loop_info64_from_old(&info, &info64); return loop_set_status(lo, &info64); } static int loop_set_status64(struct loop_device *lo, const struct loop_info64 __user *arg) { struct loop_info64 info64; if (copy_from_user(&info64, arg, sizeof (struct loop_info64))) return -EFAULT; return loop_set_status(lo, &info64); } static int loop_get_status_old(struct loop_device *lo, struct loop_info __user *arg) { struct loop_info info; struct loop_info64 info64; int err; if (!arg) return -EINVAL; err = loop_get_status(lo, &info64); if (!err) err = loop_info64_to_old(&info64, &info); if (!err && copy_to_user(arg, &info, sizeof(info))) err = -EFAULT; return err; } static int loop_get_status64(struct loop_device *lo, struct loop_info64 __user *arg) { struct loop_info64 info64; int err; if (!arg) return -EINVAL; err = loop_get_status(lo, &info64); if (!err && copy_to_user(arg, &info64, sizeof(info64))) err = -EFAULT; return err; } static int loop_set_capacity(struct loop_device *lo) { loff_t size; if (unlikely(lo->lo_state != Lo_bound)) return -ENXIO; size = get_loop_size(lo, lo->lo_backing_file); loop_set_size(lo, size); return 0; } static int loop_set_dio(struct loop_device *lo, unsigned long arg) { int error = -ENXIO; if (lo->lo_state != Lo_bound) goto out; __loop_update_dio(lo, !!arg); if (lo->use_dio == !!arg) return 0; error = -EINVAL; out: return error; } static int loop_set_block_size(struct loop_device *lo, unsigned long arg) { int err = 0; if (lo->lo_state != Lo_bound) return -ENXIO; err = blk_validate_block_size(arg); if (err) return err; if (lo->lo_queue->limits.logical_block_size == arg) return 0; sync_blockdev(lo->lo_device); invalidate_bdev(lo->lo_device); blk_mq_freeze_queue(lo->lo_queue); err = loop_reconfigure_limits(lo, arg, false); loop_update_dio(lo); blk_mq_unfreeze_queue(lo->lo_queue); return err; } static int lo_simple_ioctl(struct loop_device *lo, unsigned int cmd, unsigned long arg) { int err; err = mutex_lock_killable(&lo->lo_mutex); if (err) return err; switch (cmd) { case LOOP_SET_CAPACITY: err = loop_set_capacity(lo); break; case LOOP_SET_DIRECT_IO: err = loop_set_dio(lo, arg); break; case LOOP_SET_BLOCK_SIZE: err = loop_set_block_size(lo, arg); break; default: err = -EINVAL; } mutex_unlock(&lo->lo_mutex); return err; } static int lo_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { struct loop_device *lo = bdev->bd_disk->private_data; void __user *argp = (void __user *) arg; int err; switch (cmd) { case LOOP_SET_FD: { /* * Legacy case - pass in a zeroed out struct loop_config with * only the file descriptor set , which corresponds with the * default parameters we'd have used otherwise. */ struct loop_config config; memset(&config, 0, sizeof(config)); config.fd = arg; return loop_configure(lo, mode, bdev, &config); } case LOOP_CONFIGURE: { struct loop_config config; if (copy_from_user(&config, argp, sizeof(config))) return -EFAULT; return loop_configure(lo, mode, bdev, &config); } case LOOP_CHANGE_FD: return loop_change_fd(lo, bdev, arg); case LOOP_CLR_FD: return loop_clr_fd(lo); case LOOP_SET_STATUS: err = -EPERM; if ((mode & BLK_OPEN_WRITE) || capable(CAP_SYS_ADMIN)) err = loop_set_status_old(lo, argp); break; case LOOP_GET_STATUS: return loop_get_status_old(lo, argp); case LOOP_SET_STATUS64: err = -EPERM; if ((mode & BLK_OPEN_WRITE) || capable(CAP_SYS_ADMIN)) err = loop_set_status64(lo, argp); break; case LOOP_GET_STATUS64: return loop_get_status64(lo, argp); case LOOP_SET_CAPACITY: case LOOP_SET_DIRECT_IO: case LOOP_SET_BLOCK_SIZE: if (!(mode & BLK_OPEN_WRITE) && !capable(CAP_SYS_ADMIN)) return -EPERM; fallthrough; default: err = lo_simple_ioctl(lo, cmd, arg); break; } return err; } #ifdef CONFIG_COMPAT struct compat_loop_info { compat_int_t lo_number; /* ioctl r/o */ compat_dev_t lo_device; /* ioctl r/o */ compat_ulong_t lo_inode; /* ioctl r/o */ compat_dev_t lo_rdevice; /* ioctl r/o */ compat_int_t lo_offset; compat_int_t lo_encrypt_type; /* obsolete, ignored */ compat_int_t lo_encrypt_key_size; /* ioctl w/o */ compat_int_t lo_flags; /* ioctl r/o */ char lo_name[LO_NAME_SIZE]; unsigned char lo_encrypt_key[LO_KEY_SIZE]; /* ioctl w/o */ compat_ulong_t lo_init[2]; char reserved[4]; }; /* * Transfer 32-bit compatibility structure in userspace to 64-bit loop info * - noinlined to reduce stack space usage in main part of driver */ static noinline int loop_info64_from_compat(const struct compat_loop_info __user *arg, struct loop_info64 *info64) { struct compat_loop_info info; if (copy_from_user(&info, arg, sizeof(info))) return -EFAULT; memset(info64, 0, sizeof(*info64)); info64->lo_number = info.lo_number; info64->lo_device = info.lo_device; info64->lo_inode = info.lo_inode; info64->lo_rdevice = info.lo_rdevice; info64->lo_offset = info.lo_offset; info64->lo_sizelimit = 0; info64->lo_flags = info.lo_flags; memcpy(info64->lo_file_name, info.lo_name, LO_NAME_SIZE); return 0; } /* * Transfer 64-bit loop info to 32-bit compatibility structure in userspace * - noinlined to reduce stack space usage in main part of driver */ static noinline int loop_info64_to_compat(const struct loop_info64 *info64, struct compat_loop_info __user *arg) { struct compat_loop_info info; memset(&info, 0, sizeof(info)); info.lo_number = info64->lo_number; info.lo_device = info64->lo_device; info.lo_inode = info64->lo_inode; info.lo_rdevice = info64->lo_rdevice; info.lo_offset = info64->lo_offset; info.lo_flags = info64->lo_flags; memcpy(info.lo_name, info64->lo_file_name, LO_NAME_SIZE); /* error in case values were truncated */ if (info.lo_device != info64->lo_device || info.lo_rdevice != info64->lo_rdevice || info.lo_inode != info64->lo_inode || info.lo_offset != info64->lo_offset) return -EOVERFLOW; if (copy_to_user(arg, &info, sizeof(info))) return -EFAULT; return 0; } static int loop_set_status_compat(struct loop_device *lo, const struct compat_loop_info __user *arg) { struct loop_info64 info64; int ret; ret = loop_info64_from_compat(arg, &info64); if (ret < 0) return ret; return loop_set_status(lo, &info64); } static int loop_get_status_compat(struct loop_device *lo, struct compat_loop_info __user *arg) { struct loop_info64 info64; int err; if (!arg) return -EINVAL; err = loop_get_status(lo, &info64); if (!err) err = loop_info64_to_compat(&info64, arg); return err; } static int lo_compat_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { struct loop_device *lo = bdev->bd_disk->private_data; int err; switch(cmd) { case LOOP_SET_STATUS: err = loop_set_status_compat(lo, (const struct compat_loop_info __user *)arg); break; case LOOP_GET_STATUS: err = loop_get_status_compat(lo, (struct compat_loop_info __user *)arg); break; case LOOP_SET_CAPACITY: case LOOP_CLR_FD: case LOOP_GET_STATUS64: case LOOP_SET_STATUS64: case LOOP_CONFIGURE: arg = (unsigned long) compat_ptr(arg); fallthrough; case LOOP_SET_FD: case LOOP_CHANGE_FD: case LOOP_SET_BLOCK_SIZE: case LOOP_SET_DIRECT_IO: err = lo_ioctl(bdev, mode, cmd, arg); break; default: err = -ENOIOCTLCMD; break; } return err; } #endif static void lo_release(struct gendisk *disk) { struct loop_device *lo = disk->private_data; if (disk_openers(disk) > 0) return; mutex_lock(&lo->lo_mutex); if (lo->lo_state == Lo_bound && (lo->lo_flags & LO_FLAGS_AUTOCLEAR)) { lo->lo_state = Lo_rundown; mutex_unlock(&lo->lo_mutex); /* * In autoclear mode, stop the loop thread * and remove configuration after last close. */ __loop_clr_fd(lo, true); return; } mutex_unlock(&lo->lo_mutex); } static void lo_free_disk(struct gendisk *disk) { struct loop_device *lo = disk->private_data; if (lo->workqueue) destroy_workqueue(lo->workqueue); loop_free_idle_workers(lo, true); timer_shutdown_sync(&lo->timer); mutex_destroy(&lo->lo_mutex); kfree(lo); } static const struct block_device_operations lo_fops = { .owner = THIS_MODULE, .release = lo_release, .ioctl = lo_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = lo_compat_ioctl, #endif .free_disk = lo_free_disk, }; /* * And now the modules code and kernel interface. */ /* * If max_loop is specified, create that many devices upfront. * This also becomes a hard limit. If max_loop is not specified, * the default isn't a hard limit (as before commit 85c50197716c * changed the default value from 0 for max_loop=0 reasons), just * create CONFIG_BLK_DEV_LOOP_MIN_COUNT loop devices at module * init time. Loop devices can be requested on-demand with the * /dev/loop-control interface, or be instantiated by accessing * a 'dead' device node. */ static int max_loop = CONFIG_BLK_DEV_LOOP_MIN_COUNT; #ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD static bool max_loop_specified; static int max_loop_param_set_int(const char *val, const struct kernel_param *kp) { int ret; ret = param_set_int(val, kp); if (ret < 0) return ret; max_loop_specified = true; return 0; } static const struct kernel_param_ops max_loop_param_ops = { .set = max_loop_param_set_int, .get = param_get_int, }; module_param_cb(max_loop, &max_loop_param_ops, &max_loop, 0444); MODULE_PARM_DESC(max_loop, "Maximum number of loop devices"); #else module_param(max_loop, int, 0444); MODULE_PARM_DESC(max_loop, "Initial number of loop devices"); #endif module_param(max_part, int, 0444); MODULE_PARM_DESC(max_part, "Maximum number of partitions per loop device"); static int hw_queue_depth = LOOP_DEFAULT_HW_Q_DEPTH; static int loop_set_hw_queue_depth(const char *s, const struct kernel_param *p) { int qd, ret; ret = kstrtoint(s, 0, &qd); if (ret < 0) return ret; if (qd < 1) return -EINVAL; hw_queue_depth = qd; return 0; } static const struct kernel_param_ops loop_hw_qdepth_param_ops = { .set = loop_set_hw_queue_depth, .get = param_get_int, }; device_param_cb(hw_queue_depth, &loop_hw_qdepth_param_ops, &hw_queue_depth, 0444); MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: " __stringify(LOOP_DEFAULT_HW_Q_DEPTH)); MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR); static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct request *rq = bd->rq; struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); struct loop_device *lo = rq->q->queuedata; blk_mq_start_request(rq); if (lo->lo_state != Lo_bound) return BLK_STS_IOERR; switch (req_op(rq)) { case REQ_OP_FLUSH: case REQ_OP_DISCARD: case REQ_OP_WRITE_ZEROES: cmd->use_aio = false; break; default: cmd->use_aio = lo->use_dio; break; } /* always use the first bio's css */ cmd->blkcg_css = NULL; cmd->memcg_css = NULL; #ifdef CONFIG_BLK_CGROUP if (rq->bio) { cmd->blkcg_css = bio_blkcg_css(rq->bio); #ifdef CONFIG_MEMCG if (cmd->blkcg_css) { cmd->memcg_css = cgroup_get_e_css(cmd->blkcg_css->cgroup, &memory_cgrp_subsys); } #endif } #endif loop_queue_work(lo, cmd); return BLK_STS_OK; } static void loop_handle_cmd(struct loop_cmd *cmd) { struct cgroup_subsys_state *cmd_blkcg_css = cmd->blkcg_css; struct cgroup_subsys_state *cmd_memcg_css = cmd->memcg_css; struct request *rq = blk_mq_rq_from_pdu(cmd); const bool write = op_is_write(req_op(rq)); struct loop_device *lo = rq->q->queuedata; int ret = 0; struct mem_cgroup *old_memcg = NULL; const bool use_aio = cmd->use_aio; if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY)) { ret = -EIO; goto failed; } if (cmd_blkcg_css) kthread_associate_blkcg(cmd_blkcg_css); if (cmd_memcg_css) old_memcg = set_active_memcg( mem_cgroup_from_css(cmd_memcg_css)); /* * do_req_filebacked() may call blk_mq_complete_request() synchronously * or asynchronously if using aio. Hence, do not touch 'cmd' after * do_req_filebacked() has returned unless we are sure that 'cmd' has * not yet been completed. */ ret = do_req_filebacked(lo, rq); if (cmd_blkcg_css) kthread_associate_blkcg(NULL); if (cmd_memcg_css) { set_active_memcg(old_memcg); css_put(cmd_memcg_css); } failed: /* complete non-aio request */ if (!use_aio || ret) { if (ret == -EOPNOTSUPP) cmd->ret = ret; else cmd->ret = ret ? -EIO : 0; if (likely(!blk_should_fake_timeout(rq->q))) blk_mq_complete_request(rq); } } static void loop_process_work(struct loop_worker *worker, struct list_head *cmd_list, struct loop_device *lo) { int orig_flags = current->flags; struct loop_cmd *cmd; current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO; spin_lock_irq(&lo->lo_work_lock); while (!list_empty(cmd_list)) { cmd = container_of( cmd_list->next, struct loop_cmd, list_entry); list_del(cmd_list->next); spin_unlock_irq(&lo->lo_work_lock); loop_handle_cmd(cmd); cond_resched(); spin_lock_irq(&lo->lo_work_lock); } /* * We only add to the idle list if there are no pending cmds * *and* the worker will not run again which ensures that it * is safe to free any worker on the idle list */ if (worker && !work_pending(&worker->work)) { worker->last_ran_at = jiffies; list_add_tail(&worker->idle_list, &lo->idle_worker_list); loop_set_timer(lo); } spin_unlock_irq(&lo->lo_work_lock); current->flags = orig_flags; } static void loop_workfn(struct work_struct *work) { struct loop_worker *worker = container_of(work, struct loop_worker, work); loop_process_work(worker, &worker->cmd_list, worker->lo); } static void loop_rootcg_workfn(struct work_struct *work) { struct loop_device *lo = container_of(work, struct loop_device, rootcg_work); loop_process_work(NULL, &lo->rootcg_cmd_list, lo); } static const struct blk_mq_ops loop_mq_ops = { .queue_rq = loop_queue_rq, .complete = lo_complete_rq, }; static int loop_add(int i) { struct queue_limits lim = { /* * Random number picked from the historic block max_sectors cap. */ .max_hw_sectors = 2560u, }; struct loop_device *lo; struct gendisk *disk; int err; err = -ENOMEM; lo = kzalloc(sizeof(*lo), GFP_KERNEL); if (!lo) goto out; lo->worker_tree = RB_ROOT; INIT_LIST_HEAD(&lo->idle_worker_list); timer_setup(&lo->timer, loop_free_idle_workers_timer, TIMER_DEFERRABLE); lo->lo_state = Lo_unbound; err = mutex_lock_killable(&loop_ctl_mutex); if (err) goto out_free_dev; /* allocate id, if @id >= 0, we're requesting that specific id */ if (i >= 0) { err = idr_alloc(&loop_index_idr, lo, i, i + 1, GFP_KERNEL); if (err == -ENOSPC) err = -EEXIST; } else { err = idr_alloc(&loop_index_idr, lo, 0, 0, GFP_KERNEL); } mutex_unlock(&loop_ctl_mutex); if (err < 0) goto out_free_dev; i = err; lo->tag_set.ops = &loop_mq_ops; lo->tag_set.nr_hw_queues = 1; lo->tag_set.queue_depth = hw_queue_depth; lo->tag_set.numa_node = NUMA_NO_NODE; lo->tag_set.cmd_size = sizeof(struct loop_cmd); lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING | BLK_MQ_F_NO_SCHED_BY_DEFAULT; lo->tag_set.driver_data = lo; err = blk_mq_alloc_tag_set(&lo->tag_set); if (err) goto out_free_idr; disk = lo->lo_disk = blk_mq_alloc_disk(&lo->tag_set, &lim, lo); if (IS_ERR(disk)) { err = PTR_ERR(disk); goto out_cleanup_tags; } lo->lo_queue = lo->lo_disk->queue; /* * By default, we do buffer IO, so it doesn't make sense to enable * merge because the I/O submitted to backing file is handled page by * page. For directio mode, merge does help to dispatch bigger request * to underlayer disk. We will enable merge once directio is enabled. */ blk_queue_flag_set(QUEUE_FLAG_NOMERGES, lo->lo_queue); /* * Disable partition scanning by default. The in-kernel partition * scanning can be requested individually per-device during its * setup. Userspace can always add and remove partitions from all * devices. The needed partition minors are allocated from the * extended minor space, the main loop device numbers will continue * to match the loop minors, regardless of the number of partitions * used. * * If max_part is given, partition scanning is globally enabled for * all loop devices. The minors for the main loop devices will be * multiples of max_part. * * Note: Global-for-all-devices, set-only-at-init, read-only module * parameteters like 'max_loop' and 'max_part' make things needlessly * complicated, are too static, inflexible and may surprise * userspace tools. Parameters like this in general should be avoided. */ if (!part_shift) set_bit(GD_SUPPRESS_PART_SCAN, &disk->state); mutex_init(&lo->lo_mutex); lo->lo_number = i; spin_lock_init(&lo->lo_lock); spin_lock_init(&lo->lo_work_lock); INIT_WORK(&lo->rootcg_work, loop_rootcg_workfn); INIT_LIST_HEAD(&lo->rootcg_cmd_list); disk->major = LOOP_MAJOR; disk->first_minor = i << part_shift; disk->minors = 1 << part_shift; disk->fops = &lo_fops; disk->private_data = lo; disk->queue = lo->lo_queue; disk->events = DISK_EVENT_MEDIA_CHANGE; disk->event_flags = DISK_EVENT_FLAG_UEVENT; sprintf(disk->disk_name, "loop%d", i); /* Make this loop device reachable from pathname. */ err = add_disk(disk); if (err) goto out_cleanup_disk; /* Show this loop device. */ mutex_lock(&loop_ctl_mutex); lo->idr_visible = true; mutex_unlock(&loop_ctl_mutex); return i; out_cleanup_disk: put_disk(disk); out_cleanup_tags: blk_mq_free_tag_set(&lo->tag_set); out_free_idr: mutex_lock(&loop_ctl_mutex); idr_remove(&loop_index_idr, i); mutex_unlock(&loop_ctl_mutex); out_free_dev: kfree(lo); out: return err; } static void loop_remove(struct loop_device *lo) { /* Make this loop device unreachable from pathname. */ del_gendisk(lo->lo_disk); blk_mq_free_tag_set(&lo->tag_set); mutex_lock(&loop_ctl_mutex); idr_remove(&loop_index_idr, lo->lo_number); mutex_unlock(&loop_ctl_mutex); put_disk(lo->lo_disk); } #ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD static void loop_probe(dev_t dev) { int idx = MINOR(dev) >> part_shift; if (max_loop_specified && max_loop && idx >= max_loop) return; loop_add(idx); } #else #define loop_probe NULL #endif /* !CONFIG_BLOCK_LEGACY_AUTOLOAD */ static int loop_control_remove(int idx) { struct loop_device *lo; int ret; if (idx < 0) { pr_warn_once("deleting an unspecified loop device is not supported.\n"); return -EINVAL; } /* Hide this loop device for serialization. */ ret = mutex_lock_killable(&loop_ctl_mutex); if (ret) return ret; lo = idr_find(&loop_index_idr, idx); if (!lo || !lo->idr_visible) ret = -ENODEV; else lo->idr_visible = false; mutex_unlock(&loop_ctl_mutex); if (ret) return ret; /* Check whether this loop device can be removed. */ ret = mutex_lock_killable(&lo->lo_mutex); if (ret) goto mark_visible; if (lo->lo_state != Lo_unbound || disk_openers(lo->lo_disk) > 0) { mutex_unlock(&lo->lo_mutex); ret = -EBUSY; goto mark_visible; } /* Mark this loop device as no more bound, but not quite unbound yet */ lo->lo_state = Lo_deleting; mutex_unlock(&lo->lo_mutex); loop_remove(lo); return 0; mark_visible: /* Show this loop device again. */ mutex_lock(&loop_ctl_mutex); lo->idr_visible = true; mutex_unlock(&loop_ctl_mutex); return ret; } static int loop_control_get_free(int idx) { struct loop_device *lo; int id, ret; ret = mutex_lock_killable(&loop_ctl_mutex); if (ret) return ret; idr_for_each_entry(&loop_index_idr, lo, id) { /* Hitting a race results in creating a new loop device which is harmless. */ if (lo->idr_visible && data_race(lo->lo_state) == Lo_unbound) goto found; } mutex_unlock(&loop_ctl_mutex); return loop_add(-1); found: mutex_unlock(&loop_ctl_mutex); return id; } static long loop_control_ioctl(struct file *file, unsigned int cmd, unsigned long parm) { switch (cmd) { case LOOP_CTL_ADD: return loop_add(parm); case LOOP_CTL_REMOVE: return loop_control_remove(parm); case LOOP_CTL_GET_FREE: return loop_control_get_free(parm); default: return -ENOSYS; } } static const struct file_operations loop_ctl_fops = { .open = nonseekable_open, .unlocked_ioctl = loop_control_ioctl, .compat_ioctl = loop_control_ioctl, .owner = THIS_MODULE, .llseek = noop_llseek, }; static struct miscdevice loop_misc = { .minor = LOOP_CTRL_MINOR, .name = "loop-control", .fops = &loop_ctl_fops, }; MODULE_ALIAS_MISCDEV(LOOP_CTRL_MINOR); MODULE_ALIAS("devname:loop-control"); static int __init loop_init(void) { int i; int err; part_shift = 0; if (max_part > 0) { part_shift = fls(max_part); /* * Adjust max_part according to part_shift as it is exported * to user space so that user can decide correct minor number * if [s]he want to create more devices. * * Note that -1 is required because partition 0 is reserved * for the whole disk. */ max_part = (1UL << part_shift) - 1; } if ((1UL << part_shift) > DISK_MAX_PARTS) { err = -EINVAL; goto err_out; } if (max_loop > 1UL << (MINORBITS - part_shift)) { err = -EINVAL; goto err_out; } err = misc_register(&loop_misc); if (err < 0) goto err_out; if (__register_blkdev(LOOP_MAJOR, "loop", loop_probe)) { err = -EIO; goto misc_out; } /* pre-create number of devices given by config or max_loop */ for (i = 0; i < max_loop; i++) loop_add(i); printk(KERN_INFO "loop: module loaded\n"); return 0; misc_out: misc_deregister(&loop_misc); err_out: return err; } static void __exit loop_exit(void) { struct loop_device *lo; int id; unregister_blkdev(LOOP_MAJOR, "loop"); misc_deregister(&loop_misc); /* * There is no need to use loop_ctl_mutex here, for nobody else can * access loop_index_idr when this module is unloading (unless forced * module unloading is requested). If this is not a clean unloading, * we have no means to avoid kernel crash. */ idr_for_each_entry(&loop_index_idr, lo, id) loop_remove(lo); idr_destroy(&loop_index_idr); } module_init(loop_init); module_exit(loop_exit); #ifndef MODULE static int __init max_loop_setup(char *str) { max_loop = simple_strtol(str, NULL, 0); #ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD max_loop_specified = true; #endif return 1; } __setup("max_loop=", max_loop_setup); #endif
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 3 3 1 1 1 1 1 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 // SPDX-License-Identifier: GPL-2.0 /* * The USB Monitor, inspired by Dave Harding's USBMon. * * This is a binary format reader. * * Copyright (C) 2006 Paolo Abeni (paolo.abeni@email.it) * Copyright (C) 2006,2007 Pete Zaitcev (zaitcev@redhat.com) */ #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/types.h> #include <linux/fs.h> #include <linux/cdev.h> #include <linux/export.h> #include <linux/usb.h> #include <linux/poll.h> #include <linux/compat.h> #include <linux/mm.h> #include <linux/scatterlist.h> #include <linux/slab.h> #include <linux/time64.h> #include <linux/uaccess.h> #include "usb_mon.h" /* * Defined by USB 2.0 clause 9.3, table 9.2. */ #define SETUP_LEN 8 /* ioctl macros */ #define MON_IOC_MAGIC 0x92 #define MON_IOCQ_URB_LEN _IO(MON_IOC_MAGIC, 1) /* #2 used to be MON_IOCX_URB, removed before it got into Linus tree */ #define MON_IOCG_STATS _IOR(MON_IOC_MAGIC, 3, struct mon_bin_stats) #define MON_IOCT_RING_SIZE _IO(MON_IOC_MAGIC, 4) #define MON_IOCQ_RING_SIZE _IO(MON_IOC_MAGIC, 5) #define MON_IOCX_GET _IOW(MON_IOC_MAGIC, 6, struct mon_bin_get) #define MON_IOCX_MFETCH _IOWR(MON_IOC_MAGIC, 7, struct mon_bin_mfetch) #define MON_IOCH_MFLUSH _IO(MON_IOC_MAGIC, 8) /* #9 was MON_IOCT_SETAPI */ #define MON_IOCX_GETX _IOW(MON_IOC_MAGIC, 10, struct mon_bin_get) #ifdef CONFIG_COMPAT #define MON_IOCX_GET32 _IOW(MON_IOC_MAGIC, 6, struct mon_bin_get32) #define MON_IOCX_MFETCH32 _IOWR(MON_IOC_MAGIC, 7, struct mon_bin_mfetch32) #define MON_IOCX_GETX32 _IOW(MON_IOC_MAGIC, 10, struct mon_bin_get32) #endif /* * Some architectures have enormous basic pages (16KB for ia64, 64KB for ppc). * But it's all right. Just use a simple way to make sure the chunk is never * smaller than a page. * * N.B. An application does not know our chunk size. * * Woops, get_zeroed_page() returns a single page. I guess we're stuck with * page-sized chunks for the time being. */ #define CHUNK_SIZE PAGE_SIZE #define CHUNK_ALIGN(x) (((x)+CHUNK_SIZE-1) & ~(CHUNK_SIZE-1)) /* * The magic limit was calculated so that it allows the monitoring * application to pick data once in two ticks. This way, another application, * which presumably drives the bus, gets to hog CPU, yet we collect our data. * If HZ is 100, a 480 mbit/s bus drives 614 KB every jiffy. USB has an * enormous overhead built into the bus protocol, so we need about 1000 KB. * * This is still too much for most cases, where we just snoop a few * descriptor fetches for enumeration. So, the default is a "reasonable" * amount for systems with HZ=250 and incomplete bus saturation. * * XXX What about multi-megabyte URBs which take minutes to transfer? */ #define BUFF_MAX CHUNK_ALIGN(1200*1024) #define BUFF_DFL CHUNK_ALIGN(300*1024) #define BUFF_MIN CHUNK_ALIGN(8*1024) /* * The per-event API header (2 per URB). * * This structure is seen in userland as defined by the documentation. */ struct mon_bin_hdr { u64 id; /* URB ID - from submission to callback */ unsigned char type; /* Same as in text API; extensible. */ unsigned char xfer_type; /* ISO, Intr, Control, Bulk */ unsigned char epnum; /* Endpoint number and transfer direction */ unsigned char devnum; /* Device address */ unsigned short busnum; /* Bus number */ char flag_setup; char flag_data; s64 ts_sec; /* ktime_get_real_ts64 */ s32 ts_usec; /* ktime_get_real_ts64 */ int status; unsigned int len_urb; /* Length of data (submitted or actual) */ unsigned int len_cap; /* Delivered length */ union { unsigned char setup[SETUP_LEN]; /* Only for Control S-type */ struct iso_rec { int error_count; int numdesc; } iso; } s; int interval; int start_frame; unsigned int xfer_flags; unsigned int ndesc; /* Actual number of ISO descriptors */ }; /* * ISO vector, packed into the head of data stream. * This has to take 16 bytes to make sure that the end of buffer * wrap is not happening in the middle of a descriptor. */ struct mon_bin_isodesc { int iso_status; unsigned int iso_off; unsigned int iso_len; u32 _pad; }; /* per file statistic */ struct mon_bin_stats { u32 queued; u32 dropped; }; struct mon_bin_get { struct mon_bin_hdr __user *hdr; /* Can be 48 bytes or 64. */ void __user *data; size_t alloc; /* Length of data (can be zero) */ }; struct mon_bin_mfetch { u32 __user *offvec; /* Vector of events fetched */ u32 nfetch; /* Number of events to fetch (out: fetched) */ u32 nflush; /* Number of events to flush */ }; #ifdef CONFIG_COMPAT struct mon_bin_get32 { u32 hdr32; u32 data32; u32 alloc32; }; struct mon_bin_mfetch32 { u32 offvec32; u32 nfetch32; u32 nflush32; }; #endif /* Having these two values same prevents wrapping of the mon_bin_hdr */ #define PKT_ALIGN 64 #define PKT_SIZE 64 #define PKT_SZ_API0 48 /* API 0 (2.6.20) size */ #define PKT_SZ_API1 64 /* API 1 size: extra fields */ #define ISODESC_MAX 128 /* Same number as usbfs allows, 2048 bytes. */ /* max number of USB bus supported */ #define MON_BIN_MAX_MINOR 128 /* * The buffer: map of used pages. */ struct mon_pgmap { struct page *pg; unsigned char *ptr; /* XXX just use page_to_virt everywhere? */ }; /* * This gets associated with an open file struct. */ struct mon_reader_bin { /* The buffer: one per open. */ spinlock_t b_lock; /* Protect b_cnt, b_in */ unsigned int b_size; /* Current size of the buffer - bytes */ unsigned int b_cnt; /* Bytes used */ unsigned int b_in, b_out; /* Offsets into buffer - bytes */ unsigned int b_read; /* Amount of read data in curr. pkt. */ struct mon_pgmap *b_vec; /* The map array */ wait_queue_head_t b_wait; /* Wait for data here */ struct mutex fetch_lock; /* Protect b_read, b_out */ int mmap_active; /* A list of these is needed for "bus 0". Some time later. */ struct mon_reader r; /* Stats */ unsigned int cnt_lost; }; static inline struct mon_bin_hdr *MON_OFF2HDR(const struct mon_reader_bin *rp, unsigned int offset) { return (struct mon_bin_hdr *) (rp->b_vec[offset / CHUNK_SIZE].ptr + offset % CHUNK_SIZE); } #define MON_RING_EMPTY(rp) ((rp)->b_cnt == 0) static unsigned char xfer_to_pipe[4] = { PIPE_CONTROL, PIPE_ISOCHRONOUS, PIPE_BULK, PIPE_INTERRUPT }; static const struct class mon_bin_class = { .name = "usbmon", }; static dev_t mon_bin_dev0; static struct cdev mon_bin_cdev; static void mon_buff_area_fill(const struct mon_reader_bin *rp, unsigned int offset, unsigned int size); static int mon_bin_wait_event(struct file *file, struct mon_reader_bin *rp); static int mon_alloc_buff(struct mon_pgmap *map, int npages); static void mon_free_buff(struct mon_pgmap *map, int npages); /* * This is a "chunked memcpy". It does not manipulate any counters. */ static unsigned int mon_copy_to_buff(const struct mon_reader_bin *this, unsigned int off, const unsigned char *from, unsigned int length) { unsigned int step_len; unsigned char *buf; unsigned int in_page; while (length) { /* * Determine step_len. */ step_len = length; in_page = CHUNK_SIZE - (off & (CHUNK_SIZE-1)); if (in_page < step_len) step_len = in_page; /* * Copy data and advance pointers. */ buf = this->b_vec[off / CHUNK_SIZE].ptr + off % CHUNK_SIZE; memcpy(buf, from, step_len); if ((off += step_len) >= this->b_size) off = 0; from += step_len; length -= step_len; } return off; } /* * This is a little worse than the above because it's "chunked copy_to_user". * The return value is an error code, not an offset. */ static int copy_from_buf(const struct mon_reader_bin *this, unsigned int off, char __user *to, int length) { unsigned int step_len; unsigned char *buf; unsigned int in_page; while (length) { /* * Determine step_len. */ step_len = length; in_page = CHUNK_SIZE - (off & (CHUNK_SIZE-1)); if (in_page < step_len) step_len = in_page; /* * Copy data and advance pointers. */ buf = this->b_vec[off / CHUNK_SIZE].ptr + off % CHUNK_SIZE; if (copy_to_user(to, buf, step_len)) return -EINVAL; if ((off += step_len) >= this->b_size) off = 0; to += step_len; length -= step_len; } return 0; } /* * Allocate an (aligned) area in the buffer. * This is called under b_lock. * Returns ~0 on failure. */ static unsigned int mon_buff_area_alloc(struct mon_reader_bin *rp, unsigned int size) { unsigned int offset; size = (size + PKT_ALIGN-1) & ~(PKT_ALIGN-1); if (rp->b_cnt + size > rp->b_size) return ~0; offset = rp->b_in; rp->b_cnt += size; if ((rp->b_in += size) >= rp->b_size) rp->b_in -= rp->b_size; return offset; } /* * This is the same thing as mon_buff_area_alloc, only it does not allow * buffers to wrap. This is needed by applications which pass references * into mmap-ed buffers up their stacks (libpcap can do that). * * Currently, we always have the header stuck with the data, although * it is not strictly speaking necessary. * * When a buffer would wrap, we place a filler packet to mark the space. */ static unsigned int mon_buff_area_alloc_contiguous(struct mon_reader_bin *rp, unsigned int size) { unsigned int offset; unsigned int fill_size; size = (size + PKT_ALIGN-1) & ~(PKT_ALIGN-1); if (rp->b_cnt + size > rp->b_size) return ~0; if (rp->b_in + size > rp->b_size) { /* * This would wrap. Find if we still have space after * skipping to the end of the buffer. If we do, place * a filler packet and allocate a new packet. */ fill_size = rp->b_size - rp->b_in; if (rp->b_cnt + size + fill_size > rp->b_size) return ~0; mon_buff_area_fill(rp, rp->b_in, fill_size); offset = 0; rp->b_in = size; rp->b_cnt += size + fill_size; } else if (rp->b_in + size == rp->b_size) { offset = rp->b_in; rp->b_in = 0; rp->b_cnt += size; } else { offset = rp->b_in; rp->b_in += size; rp->b_cnt += size; } return offset; } /* * Return a few (kilo-)bytes to the head of the buffer. * This is used if a data fetch fails. */ static void mon_buff_area_shrink(struct mon_reader_bin *rp, unsigned int size) { /* size &= ~(PKT_ALIGN-1); -- we're called with aligned size */ rp->b_cnt -= size; if (rp->b_in < size) rp->b_in += rp->b_size; rp->b_in -= size; } /* * This has to be called under both b_lock and fetch_lock, because * it accesses both b_cnt and b_out. */ static void mon_buff_area_free(struct mon_reader_bin *rp, unsigned int size) { size = (size + PKT_ALIGN-1) & ~(PKT_ALIGN-1); rp->b_cnt -= size; if ((rp->b_out += size) >= rp->b_size) rp->b_out -= rp->b_size; } static void mon_buff_area_fill(const struct mon_reader_bin *rp, unsigned int offset, unsigned int size) { struct mon_bin_hdr *ep; ep = MON_OFF2HDR(rp, offset); memset(ep, 0, PKT_SIZE); ep->type = '@'; ep->len_cap = size - PKT_SIZE; } static inline char mon_bin_get_setup(unsigned char *setupb, const struct urb *urb, char ev_type) { if (urb->setup_packet == NULL) return 'Z'; memcpy(setupb, urb->setup_packet, SETUP_LEN); return 0; } static unsigned int mon_bin_get_data(const struct mon_reader_bin *rp, unsigned int offset, struct urb *urb, unsigned int length, char *flag) { int i; struct scatterlist *sg; unsigned int this_len; *flag = 0; if (urb->num_sgs == 0) { if (urb->transfer_buffer == NULL) { *flag = 'Z'; return length; } mon_copy_to_buff(rp, offset, urb->transfer_buffer, length); length = 0; } else { /* If IOMMU coalescing occurred, we cannot trust sg_page */ if (urb->transfer_flags & URB_DMA_SG_COMBINED) { *flag = 'D'; return length; } /* Copy up to the first non-addressable segment */ for_each_sg(urb->sg, sg, urb->num_sgs, i) { if (length == 0 || PageHighMem(sg_page(sg))) break; this_len = min_t(unsigned int, sg->length, length); offset = mon_copy_to_buff(rp, offset, sg_virt(sg), this_len); length -= this_len; } if (i == 0) *flag = 'D'; } return length; } /* * This is the look-ahead pass in case of 'C Zi', when actual_length cannot * be used to determine the length of the whole contiguous buffer. */ static unsigned int mon_bin_collate_isodesc(const struct mon_reader_bin *rp, struct urb *urb, unsigned int ndesc) { struct usb_iso_packet_descriptor *fp; unsigned int length; length = 0; fp = urb->iso_frame_desc; while (ndesc-- != 0) { if (fp->actual_length != 0) { if (fp->offset + fp->actual_length > length) length = fp->offset + fp->actual_length; } fp++; } return length; } static void mon_bin_get_isodesc(const struct mon_reader_bin *rp, unsigned int offset, struct urb *urb, char ev_type, unsigned int ndesc) { struct mon_bin_isodesc *dp; struct usb_iso_packet_descriptor *fp; fp = urb->iso_frame_desc; while (ndesc-- != 0) { dp = (struct mon_bin_isodesc *) (rp->b_vec[offset / CHUNK_SIZE].ptr + offset % CHUNK_SIZE); dp->iso_status = fp->status; dp->iso_off = fp->offset; dp->iso_len = (ev_type == 'S') ? fp->length : fp->actual_length; dp->_pad = 0; if ((offset += sizeof(struct mon_bin_isodesc)) >= rp->b_size) offset = 0; fp++; } } static void mon_bin_event(struct mon_reader_bin *rp, struct urb *urb, char ev_type, int status) { const struct usb_endpoint_descriptor *epd = &urb->ep->desc; struct timespec64 ts; unsigned long flags; unsigned int urb_length; unsigned int offset; unsigned int length; unsigned int delta; unsigned int ndesc, lendesc; unsigned char dir; struct mon_bin_hdr *ep; char data_tag = 0; ktime_get_real_ts64(&ts); spin_lock_irqsave(&rp->b_lock, flags); /* * Find the maximum allowable length, then allocate space. */ urb_length = (ev_type == 'S') ? urb->transfer_buffer_length : urb->actual_length; length = urb_length; if (usb_endpoint_xfer_isoc(epd)) { if (urb->number_of_packets < 0) { ndesc = 0; } else if (urb->number_of_packets >= ISODESC_MAX) { ndesc = ISODESC_MAX; } else { ndesc = urb->number_of_packets; } if (ev_type == 'C' && usb_urb_dir_in(urb)) length = mon_bin_collate_isodesc(rp, urb, ndesc); } else { ndesc = 0; } lendesc = ndesc*sizeof(struct mon_bin_isodesc); /* not an issue unless there's a subtle bug in a HCD somewhere */ if (length >= urb->transfer_buffer_length) length = urb->transfer_buffer_length; if (length >= rp->b_size/5) length = rp->b_size/5; if (usb_urb_dir_in(urb)) { if (ev_type == 'S') { length = 0; data_tag = '<'; } /* Cannot rely on endpoint number in case of control ep.0 */ dir = USB_DIR_IN; } else { if (ev_type == 'C') { length = 0; data_tag = '>'; } dir = 0; } if (rp->mmap_active) { offset = mon_buff_area_alloc_contiguous(rp, length + PKT_SIZE + lendesc); } else { offset = mon_buff_area_alloc(rp, length + PKT_SIZE + lendesc); } if (offset == ~0) { rp->cnt_lost++; spin_unlock_irqrestore(&rp->b_lock, flags); return; } ep = MON_OFF2HDR(rp, offset); if ((offset += PKT_SIZE) >= rp->b_size) offset = 0; /* * Fill the allocated area. */ memset(ep, 0, PKT_SIZE); ep->type = ev_type; ep->xfer_type = xfer_to_pipe[usb_endpoint_type(epd)]; ep->epnum = dir | usb_endpoint_num(epd); ep->devnum = urb->dev->devnum; ep->busnum = urb->dev->bus->busnum; ep->id = (unsigned long) urb; ep->ts_sec = ts.tv_sec; ep->ts_usec = ts.tv_nsec / NSEC_PER_USEC; ep->status = status; ep->len_urb = urb_length; ep->len_cap = length + lendesc; ep->xfer_flags = urb->transfer_flags; if (usb_endpoint_xfer_int(epd)) { ep->interval = urb->interval; } else if (usb_endpoint_xfer_isoc(epd)) { ep->interval = urb->interval; ep->start_frame = urb->start_frame; ep->s.iso.error_count = urb->error_count; ep->s.iso.numdesc = urb->number_of_packets; } if (usb_endpoint_xfer_control(epd) && ev_type == 'S') { ep->flag_setup = mon_bin_get_setup(ep->s.setup, urb, ev_type); } else { ep->flag_setup = '-'; } if (ndesc != 0) { ep->ndesc = ndesc; mon_bin_get_isodesc(rp, offset, urb, ev_type, ndesc); if ((offset += lendesc) >= rp->b_size) offset -= rp->b_size; } if (length != 0) { length = mon_bin_get_data(rp, offset, urb, length, &ep->flag_data); if (length > 0) { delta = (ep->len_cap + PKT_ALIGN-1) & ~(PKT_ALIGN-1); ep->len_cap -= length; delta -= (ep->len_cap + PKT_ALIGN-1) & ~(PKT_ALIGN-1); mon_buff_area_shrink(rp, delta); } } else { ep->flag_data = data_tag; } spin_unlock_irqrestore(&rp->b_lock, flags); wake_up(&rp->b_wait); } static void mon_bin_submit(void *data, struct urb *urb) { struct mon_reader_bin *rp = data; mon_bin_event(rp, urb, 'S', -EINPROGRESS); } static void mon_bin_complete(void *data, struct urb *urb, int status) { struct mon_reader_bin *rp = data; mon_bin_event(rp, urb, 'C', status); } static void mon_bin_error(void *data, struct urb *urb, int error) { struct mon_reader_bin *rp = data; struct timespec64 ts; unsigned long flags; unsigned int offset; struct mon_bin_hdr *ep; ktime_get_real_ts64(&ts); spin_lock_irqsave(&rp->b_lock, flags); offset = mon_buff_area_alloc(rp, PKT_SIZE); if (offset == ~0) { /* Not incrementing cnt_lost. Just because. */ spin_unlock_irqrestore(&rp->b_lock, flags); return; } ep = MON_OFF2HDR(rp, offset); memset(ep, 0, PKT_SIZE); ep->type = 'E'; ep->xfer_type = xfer_to_pipe[usb_endpoint_type(&urb->ep->desc)]; ep->epnum = usb_urb_dir_in(urb) ? USB_DIR_IN : 0; ep->epnum |= usb_endpoint_num(&urb->ep->desc); ep->devnum = urb->dev->devnum; ep->busnum = urb->dev->bus->busnum; ep->id = (unsigned long) urb; ep->ts_sec = ts.tv_sec; ep->ts_usec = ts.tv_nsec / NSEC_PER_USEC; ep->status = error; ep->flag_setup = '-'; ep->flag_data = 'E'; spin_unlock_irqrestore(&rp->b_lock, flags); wake_up(&rp->b_wait); } static int mon_bin_open(struct inode *inode, struct file *file) { struct mon_bus *mbus; struct mon_reader_bin *rp; size_t size; int rc; mutex_lock(&mon_lock); mbus = mon_bus_lookup(iminor(inode)); if (mbus == NULL) { mutex_unlock(&mon_lock); return -ENODEV; } if (mbus != &mon_bus0 && mbus->u_bus == NULL) { printk(KERN_ERR TAG ": consistency error on open\n"); mutex_unlock(&mon_lock); return -ENODEV; } rp = kzalloc(sizeof(struct mon_reader_bin), GFP_KERNEL); if (rp == NULL) { rc = -ENOMEM; goto err_alloc; } spin_lock_init(&rp->b_lock); init_waitqueue_head(&rp->b_wait); mutex_init(&rp->fetch_lock); rp->b_size = BUFF_DFL; size = sizeof(struct mon_pgmap) * (rp->b_size/CHUNK_SIZE); if ((rp->b_vec = kzalloc(size, GFP_KERNEL)) == NULL) { rc = -ENOMEM; goto err_allocvec; } if ((rc = mon_alloc_buff(rp->b_vec, rp->b_size/CHUNK_SIZE)) < 0) goto err_allocbuff; rp->r.m_bus = mbus; rp->r.r_data = rp; rp->r.rnf_submit = mon_bin_submit; rp->r.rnf_error = mon_bin_error; rp->r.rnf_complete = mon_bin_complete; mon_reader_add(mbus, &rp->r); file->private_data = rp; mutex_unlock(&mon_lock); return 0; err_allocbuff: kfree(rp->b_vec); err_allocvec: kfree(rp); err_alloc: mutex_unlock(&mon_lock); return rc; } /* * Extract an event from buffer and copy it to user space. * Wait if there is no event ready. * Returns zero or error. */ static int mon_bin_get_event(struct file *file, struct mon_reader_bin *rp, struct mon_bin_hdr __user *hdr, unsigned int hdrbytes, void __user *data, unsigned int nbytes) { unsigned long flags; struct mon_bin_hdr *ep; size_t step_len; unsigned int offset; int rc; mutex_lock(&rp->fetch_lock); if ((rc = mon_bin_wait_event(file, rp)) < 0) { mutex_unlock(&rp->fetch_lock); return rc; } ep = MON_OFF2HDR(rp, rp->b_out); if (copy_to_user(hdr, ep, hdrbytes)) { mutex_unlock(&rp->fetch_lock); return -EFAULT; } step_len = min(ep->len_cap, nbytes); if ((offset = rp->b_out + PKT_SIZE) >= rp->b_size) offset = 0; if (copy_from_buf(rp, offset, data, step_len)) { mutex_unlock(&rp->fetch_lock); return -EFAULT; } spin_lock_irqsave(&rp->b_lock, flags); mon_buff_area_free(rp, PKT_SIZE + ep->len_cap); spin_unlock_irqrestore(&rp->b_lock, flags); rp->b_read = 0; mutex_unlock(&rp->fetch_lock); return 0; } static int mon_bin_release(struct inode *inode, struct file *file) { struct mon_reader_bin *rp = file->private_data; struct mon_bus* mbus = rp->r.m_bus; mutex_lock(&mon_lock); if (mbus->nreaders <= 0) { printk(KERN_ERR TAG ": consistency error on close\n"); mutex_unlock(&mon_lock); return 0; } mon_reader_del(mbus, &rp->r); mon_free_buff(rp->b_vec, rp->b_size/CHUNK_SIZE); kfree(rp->b_vec); kfree(rp); mutex_unlock(&mon_lock); return 0; } static ssize_t mon_bin_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { struct mon_reader_bin *rp = file->private_data; unsigned int hdrbytes = PKT_SZ_API0; unsigned long flags; struct mon_bin_hdr *ep; unsigned int offset; size_t step_len; char *ptr; ssize_t done = 0; int rc; mutex_lock(&rp->fetch_lock); if ((rc = mon_bin_wait_event(file, rp)) < 0) { mutex_unlock(&rp->fetch_lock); return rc; } ep = MON_OFF2HDR(rp, rp->b_out); if (rp->b_read < hdrbytes) { step_len = min(nbytes, (size_t)(hdrbytes - rp->b_read)); ptr = ((char *)ep) + rp->b_read; if (step_len && copy_to_user(buf, ptr, step_len)) { mutex_unlock(&rp->fetch_lock); return -EFAULT; } nbytes -= step_len; buf += step_len; rp->b_read += step_len; done += step_len; } if (rp->b_read >= hdrbytes) { step_len = ep->len_cap; step_len -= rp->b_read - hdrbytes; if (step_len > nbytes) step_len = nbytes; offset = rp->b_out + PKT_SIZE; offset += rp->b_read - hdrbytes; if (offset >= rp->b_size) offset -= rp->b_size; if (copy_from_buf(rp, offset, buf, step_len)) { mutex_unlock(&rp->fetch_lock); return -EFAULT; } nbytes -= step_len; buf += step_len; rp->b_read += step_len; done += step_len; } /* * Check if whole packet was read, and if so, jump to the next one. */ if (rp->b_read >= hdrbytes + ep->len_cap) { spin_lock_irqsave(&rp->b_lock, flags); mon_buff_area_free(rp, PKT_SIZE + ep->len_cap); spin_unlock_irqrestore(&rp->b_lock, flags); rp->b_read = 0; } mutex_unlock(&rp->fetch_lock); return done; } /* * Remove at most nevents from chunked buffer. * Returns the number of removed events. */ static int mon_bin_flush(struct mon_reader_bin *rp, unsigned nevents) { unsigned long flags; struct mon_bin_hdr *ep; int i; mutex_lock(&rp->fetch_lock); spin_lock_irqsave(&rp->b_lock, flags); for (i = 0; i < nevents; ++i) { if (MON_RING_EMPTY(rp)) break; ep = MON_OFF2HDR(rp, rp->b_out); mon_buff_area_free(rp, PKT_SIZE + ep->len_cap); } spin_unlock_irqrestore(&rp->b_lock, flags); rp->b_read = 0; mutex_unlock(&rp->fetch_lock); return i; } /* * Fetch at most max event offsets into the buffer and put them into vec. * The events are usually freed later with mon_bin_flush. * Return the effective number of events fetched. */ static int mon_bin_fetch(struct file *file, struct mon_reader_bin *rp, u32 __user *vec, unsigned int max) { unsigned int cur_out; unsigned int bytes, avail; unsigned int size; unsigned int nevents; struct mon_bin_hdr *ep; unsigned long flags; int rc; mutex_lock(&rp->fetch_lock); if ((rc = mon_bin_wait_event(file, rp)) < 0) { mutex_unlock(&rp->fetch_lock); return rc; } spin_lock_irqsave(&rp->b_lock, flags); avail = rp->b_cnt; spin_unlock_irqrestore(&rp->b_lock, flags); cur_out = rp->b_out; nevents = 0; bytes = 0; while (bytes < avail) { if (nevents >= max) break; ep = MON_OFF2HDR(rp, cur_out); if (put_user(cur_out, &vec[nevents])) { mutex_unlock(&rp->fetch_lock); return -EFAULT; } nevents++; size = ep->len_cap + PKT_SIZE; size = (size + PKT_ALIGN-1) & ~(PKT_ALIGN-1); if ((cur_out += size) >= rp->b_size) cur_out -= rp->b_size; bytes += size; } mutex_unlock(&rp->fetch_lock); return nevents; } /* * Count events. This is almost the same as the above mon_bin_fetch, * only we do not store offsets into user vector, and we have no limit. */ static int mon_bin_queued(struct mon_reader_bin *rp) { unsigned int cur_out; unsigned int bytes, avail; unsigned int size; unsigned int nevents; struct mon_bin_hdr *ep; unsigned long flags; mutex_lock(&rp->fetch_lock); spin_lock_irqsave(&rp->b_lock, flags); avail = rp->b_cnt; spin_unlock_irqrestore(&rp->b_lock, flags); cur_out = rp->b_out; nevents = 0; bytes = 0; while (bytes < avail) { ep = MON_OFF2HDR(rp, cur_out); nevents++; size = ep->len_cap + PKT_SIZE; size = (size + PKT_ALIGN-1) & ~(PKT_ALIGN-1); if ((cur_out += size) >= rp->b_size) cur_out -= rp->b_size; bytes += size; } mutex_unlock(&rp->fetch_lock); return nevents; } /* */ static long mon_bin_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct mon_reader_bin *rp = file->private_data; // struct mon_bus* mbus = rp->r.m_bus; int ret = 0; struct mon_bin_hdr *ep; unsigned long flags; switch (cmd) { case MON_IOCQ_URB_LEN: /* * N.B. This only returns the size of data, without the header. */ spin_lock_irqsave(&rp->b_lock, flags); if (!MON_RING_EMPTY(rp)) { ep = MON_OFF2HDR(rp, rp->b_out); ret = ep->len_cap; } spin_unlock_irqrestore(&rp->b_lock, flags); break; case MON_IOCQ_RING_SIZE: mutex_lock(&rp->fetch_lock); ret = rp->b_size; mutex_unlock(&rp->fetch_lock); break; case MON_IOCT_RING_SIZE: /* * Changing the buffer size will flush it's contents; the new * buffer is allocated before releasing the old one to be sure * the device will stay functional also in case of memory * pressure. */ { int size; struct mon_pgmap *vec; if (arg < BUFF_MIN || arg > BUFF_MAX) return -EINVAL; size = CHUNK_ALIGN(arg); vec = kcalloc(size / CHUNK_SIZE, sizeof(struct mon_pgmap), GFP_KERNEL); if (vec == NULL) { ret = -ENOMEM; break; } ret = mon_alloc_buff(vec, size/CHUNK_SIZE); if (ret < 0) { kfree(vec); break; } mutex_lock(&rp->fetch_lock); spin_lock_irqsave(&rp->b_lock, flags); if (rp->mmap_active) { mon_free_buff(vec, size/CHUNK_SIZE); kfree(vec); ret = -EBUSY; } else { mon_free_buff(rp->b_vec, rp->b_size/CHUNK_SIZE); kfree(rp->b_vec); rp->b_vec = vec; rp->b_size = size; rp->b_read = rp->b_in = rp->b_out = rp->b_cnt = 0; rp->cnt_lost = 0; } spin_unlock_irqrestore(&rp->b_lock, flags); mutex_unlock(&rp->fetch_lock); } break; case MON_IOCH_MFLUSH: ret = mon_bin_flush(rp, arg); break; case MON_IOCX_GET: case MON_IOCX_GETX: { struct mon_bin_get getb; if (copy_from_user(&getb, (void __user *)arg, sizeof(struct mon_bin_get))) return -EFAULT; if (getb.alloc > 0x10000000) /* Want to cast to u32 */ return -EINVAL; ret = mon_bin_get_event(file, rp, getb.hdr, (cmd == MON_IOCX_GET)? PKT_SZ_API0: PKT_SZ_API1, getb.data, (unsigned int)getb.alloc); } break; case MON_IOCX_MFETCH: { struct mon_bin_mfetch mfetch; struct mon_bin_mfetch __user *uptr; uptr = (struct mon_bin_mfetch __user *)arg; if (copy_from_user(&mfetch, uptr, sizeof(mfetch))) return -EFAULT; if (mfetch.nflush) { ret = mon_bin_flush(rp, mfetch.nflush); if (ret < 0) return ret; if (put_user(ret, &uptr->nflush)) return -EFAULT; } ret = mon_bin_fetch(file, rp, mfetch.offvec, mfetch.nfetch); if (ret < 0) return ret; if (put_user(ret, &uptr->nfetch)) return -EFAULT; ret = 0; } break; case MON_IOCG_STATS: { struct mon_bin_stats __user *sp; unsigned int nevents; unsigned int ndropped; spin_lock_irqsave(&rp->b_lock, flags); ndropped = rp->cnt_lost; rp->cnt_lost = 0; spin_unlock_irqrestore(&rp->b_lock, flags); nevents = mon_bin_queued(rp); sp = (struct mon_bin_stats __user *)arg; if (put_user(ndropped, &sp->dropped)) return -EFAULT; if (put_user(nevents, &sp->queued)) return -EFAULT; } break; default: return -ENOTTY; } return ret; } #ifdef CONFIG_COMPAT static long mon_bin_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct mon_reader_bin *rp = file->private_data; int ret; switch (cmd) { case MON_IOCX_GET32: case MON_IOCX_GETX32: { struct mon_bin_get32 getb; if (copy_from_user(&getb, (void __user *)arg, sizeof(struct mon_bin_get32))) return -EFAULT; ret = mon_bin_get_event(file, rp, compat_ptr(getb.hdr32), (cmd == MON_IOCX_GET32)? PKT_SZ_API0: PKT_SZ_API1, compat_ptr(getb.data32), getb.alloc32); if (ret < 0) return ret; } return 0; case MON_IOCX_MFETCH32: { struct mon_bin_mfetch32 mfetch; struct mon_bin_mfetch32 __user *uptr; uptr = (struct mon_bin_mfetch32 __user *) compat_ptr(arg); if (copy_from_user(&mfetch, uptr, sizeof(mfetch))) return -EFAULT; if (mfetch.nflush32) { ret = mon_bin_flush(rp, mfetch.nflush32); if (ret < 0) return ret; if (put_user(ret, &uptr->nflush32)) return -EFAULT; } ret = mon_bin_fetch(file, rp, compat_ptr(mfetch.offvec32), mfetch.nfetch32); if (ret < 0) return ret; if (put_user(ret, &uptr->nfetch32)) return -EFAULT; } return 0; case MON_IOCG_STATS: return mon_bin_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); case MON_IOCQ_URB_LEN: case MON_IOCQ_RING_SIZE: case MON_IOCT_RING_SIZE: case MON_IOCH_MFLUSH: return mon_bin_ioctl(file, cmd, arg); default: ; } return -ENOTTY; } #endif /* CONFIG_COMPAT */ static __poll_t mon_bin_poll(struct file *file, struct poll_table_struct *wait) { struct mon_reader_bin *rp = file->private_data; __poll_t mask = 0; unsigned long flags; if (file->f_mode & FMODE_READ) poll_wait(file, &rp->b_wait, wait); spin_lock_irqsave(&rp->b_lock, flags); if (!MON_RING_EMPTY(rp)) mask |= EPOLLIN | EPOLLRDNORM; /* readable */ spin_unlock_irqrestore(&rp->b_lock, flags); return mask; } /* * open and close: just keep track of how many times the device is * mapped, to use the proper memory allocation function. */ static void mon_bin_vma_open(struct vm_area_struct *vma) { struct mon_reader_bin *rp = vma->vm_private_data; unsigned long flags; spin_lock_irqsave(&rp->b_lock, flags); rp->mmap_active++; spin_unlock_irqrestore(&rp->b_lock, flags); } static void mon_bin_vma_close(struct vm_area_struct *vma) { unsigned long flags; struct mon_reader_bin *rp = vma->vm_private_data; spin_lock_irqsave(&rp->b_lock, flags); rp->mmap_active--; spin_unlock_irqrestore(&rp->b_lock, flags); } /* * Map ring pages to user space. */ static vm_fault_t mon_bin_vma_fault(struct vm_fault *vmf) { struct mon_reader_bin *rp = vmf->vma->vm_private_data; unsigned long offset, chunk_idx; struct page *pageptr; unsigned long flags; spin_lock_irqsave(&rp->b_lock, flags); offset = vmf->pgoff << PAGE_SHIFT; if (offset >= rp->b_size) { spin_unlock_irqrestore(&rp->b_lock, flags); return VM_FAULT_SIGBUS; } chunk_idx = offset / CHUNK_SIZE; pageptr = rp->b_vec[chunk_idx].pg; get_page(pageptr); vmf->page = pageptr; spin_unlock_irqrestore(&rp->b_lock, flags); return 0; } static const struct vm_operations_struct mon_bin_vm_ops = { .open = mon_bin_vma_open, .close = mon_bin_vma_close, .fault = mon_bin_vma_fault, }; static int mon_bin_mmap(struct file *filp, struct vm_area_struct *vma) { /* don't do anything here: "fault" will set up page table entries */ vma->vm_ops = &mon_bin_vm_ops; if (vma->vm_flags & VM_WRITE) return -EPERM; vm_flags_mod(vma, VM_DONTEXPAND | VM_DONTDUMP, VM_MAYWRITE); vma->vm_private_data = filp->private_data; mon_bin_vma_open(vma); return 0; } static const struct file_operations mon_fops_binary = { .owner = THIS_MODULE, .open = mon_bin_open, .llseek = no_llseek, .read = mon_bin_read, /* .write = mon_text_write, */ .poll = mon_bin_poll, .unlocked_ioctl = mon_bin_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = mon_bin_compat_ioctl, #endif .release = mon_bin_release, .mmap = mon_bin_mmap, }; static int mon_bin_wait_event(struct file *file, struct mon_reader_bin *rp) { DECLARE_WAITQUEUE(waita, current); unsigned long flags; add_wait_queue(&rp->b_wait, &waita); set_current_state(TASK_INTERRUPTIBLE); spin_lock_irqsave(&rp->b_lock, flags); while (MON_RING_EMPTY(rp)) { spin_unlock_irqrestore(&rp->b_lock, flags); if (file->f_flags & O_NONBLOCK) { set_current_state(TASK_RUNNING); remove_wait_queue(&rp->b_wait, &waita); return -EWOULDBLOCK; /* Same as EAGAIN in Linux */ } schedule(); if (signal_pending(current)) { remove_wait_queue(&rp->b_wait, &waita); return -EINTR; } set_current_state(TASK_INTERRUPTIBLE); spin_lock_irqsave(&rp->b_lock, flags); } spin_unlock_irqrestore(&rp->b_lock, flags); set_current_state(TASK_RUNNING); remove_wait_queue(&rp->b_wait, &waita); return 0; } static int mon_alloc_buff(struct mon_pgmap *map, int npages) { int n; unsigned long vaddr; for (n = 0; n < npages; n++) { vaddr = get_zeroed_page(GFP_KERNEL); if (vaddr == 0) { while (n-- != 0) free_page((unsigned long) map[n].ptr); return -ENOMEM; } map[n].ptr = (unsigned char *) vaddr; map[n].pg = virt_to_page((void *) vaddr); } return 0; } static void mon_free_buff(struct mon_pgmap *map, int npages) { int n; for (n = 0; n < npages; n++) free_page((unsigned long) map[n].ptr); } int mon_bin_add(struct mon_bus *mbus, const struct usb_bus *ubus) { struct device *dev; unsigned minor = ubus? ubus->busnum: 0; if (minor >= MON_BIN_MAX_MINOR) return 0; dev = device_create(&mon_bin_class, ubus ? ubus->controller : NULL, MKDEV(MAJOR(mon_bin_dev0), minor), NULL, "usbmon%d", minor); if (IS_ERR(dev)) return 0; mbus->classdev = dev; return 1; } void mon_bin_del(struct mon_bus *mbus) { device_destroy(&mon_bin_class, mbus->classdev->devt); } int __init mon_bin_init(void) { int rc; rc = class_register(&mon_bin_class); if (rc) goto err_class; rc = alloc_chrdev_region(&mon_bin_dev0, 0, MON_BIN_MAX_MINOR, "usbmon"); if (rc < 0) goto err_dev; cdev_init(&mon_bin_cdev, &mon_fops_binary); mon_bin_cdev.owner = THIS_MODULE; rc = cdev_add(&mon_bin_cdev, mon_bin_dev0, MON_BIN_MAX_MINOR); if (rc < 0) goto err_add; return 0; err_add: unregister_chrdev_region(mon_bin_dev0, MON_BIN_MAX_MINOR); err_dev: class_unregister(&mon_bin_class); err_class: return rc; } void mon_bin_exit(void) { cdev_del(&mon_bin_cdev); unregister_chrdev_region(mon_bin_dev0, MON_BIN_MAX_MINOR); class_unregister(&mon_bin_class); }
1 1 1 1 9 8 9 1 1 9 9 9 8 9 6 1 6 6 5 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 // SPDX-License-Identifier: GPL-2.0-only #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/skbuff.h> #include <linux/netfilter.h> #include <linux/seq_file.h> #include <net/protocol.h> #include <net/netfilter/nf_log.h> #include "nf_internals.h" /* Internal logging interface, which relies on the real LOG target modules */ #define NFLOGGER_NAME_LEN 64 int sysctl_nf_log_all_netns __read_mostly; EXPORT_SYMBOL(sysctl_nf_log_all_netns); static struct nf_logger __rcu *loggers[NFPROTO_NUMPROTO][NF_LOG_TYPE_MAX] __read_mostly; static DEFINE_MUTEX(nf_log_mutex); #define nft_log_dereference(logger) \ rcu_dereference_protected(logger, lockdep_is_held(&nf_log_mutex)) static struct nf_logger *__find_logger(int pf, const char *str_logger) { struct nf_logger *log; int i; for (i = 0; i < NF_LOG_TYPE_MAX; i++) { log = nft_log_dereference(loggers[pf][i]); if (!log) continue; if (!strncasecmp(str_logger, log->name, strlen(log->name))) return log; } return NULL; } int nf_log_set(struct net *net, u_int8_t pf, const struct nf_logger *logger) { const struct nf_logger *log; if (pf == NFPROTO_UNSPEC || pf >= ARRAY_SIZE(net->nf.nf_loggers)) return -EOPNOTSUPP; mutex_lock(&nf_log_mutex); log = nft_log_dereference(net->nf.nf_loggers[pf]); if (log == NULL) rcu_assign_pointer(net->nf.nf_loggers[pf], logger); mutex_unlock(&nf_log_mutex); return 0; } EXPORT_SYMBOL(nf_log_set); void nf_log_unset(struct net *net, const struct nf_logger *logger) { int i; const struct nf_logger *log; mutex_lock(&nf_log_mutex); for (i = 0; i < NFPROTO_NUMPROTO; i++) { log = nft_log_dereference(net->nf.nf_loggers[i]); if (log == logger) RCU_INIT_POINTER(net->nf.nf_loggers[i], NULL); } mutex_unlock(&nf_log_mutex); } EXPORT_SYMBOL(nf_log_unset); /* return EEXIST if the same logger is registered, 0 on success. */ int nf_log_register(u_int8_t pf, struct nf_logger *logger) { int i; int ret = 0; if (pf >= ARRAY_SIZE(init_net.nf.nf_loggers)) return -EINVAL; mutex_lock(&nf_log_mutex); if (pf == NFPROTO_UNSPEC) { for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) { if (rcu_access_pointer(loggers[i][logger->type])) { ret = -EEXIST; goto unlock; } } for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) rcu_assign_pointer(loggers[i][logger->type], logger); } else { if (rcu_access_pointer(loggers[pf][logger->type])) { ret = -EEXIST; goto unlock; } rcu_assign_pointer(loggers[pf][logger->type], logger); } unlock: mutex_unlock(&nf_log_mutex); return ret; } EXPORT_SYMBOL(nf_log_register); void nf_log_unregister(struct nf_logger *logger) { const struct nf_logger *log; int i; mutex_lock(&nf_log_mutex); for (i = 0; i < NFPROTO_NUMPROTO; i++) { log = nft_log_dereference(loggers[i][logger->type]); if (log == logger) RCU_INIT_POINTER(loggers[i][logger->type], NULL); } mutex_unlock(&nf_log_mutex); synchronize_rcu(); } EXPORT_SYMBOL(nf_log_unregister); int nf_log_bind_pf(struct net *net, u_int8_t pf, const struct nf_logger *logger) { if (pf >= ARRAY_SIZE(net->nf.nf_loggers)) return -EINVAL; mutex_lock(&nf_log_mutex); if (__find_logger(pf, logger->name) == NULL) { mutex_unlock(&nf_log_mutex); return -ENOENT; } rcu_assign_pointer(net->nf.nf_loggers[pf], logger); mutex_unlock(&nf_log_mutex); return 0; } EXPORT_SYMBOL(nf_log_bind_pf); void nf_log_unbind_pf(struct net *net, u_int8_t pf) { if (pf >= ARRAY_SIZE(net->nf.nf_loggers)) return; mutex_lock(&nf_log_mutex); RCU_INIT_POINTER(net->nf.nf_loggers[pf], NULL); mutex_unlock(&nf_log_mutex); } EXPORT_SYMBOL(nf_log_unbind_pf); int nf_logger_find_get(int pf, enum nf_log_type type) { struct nf_logger *logger; int ret = -ENOENT; if (pf >= ARRAY_SIZE(loggers)) return -EINVAL; if (type >= NF_LOG_TYPE_MAX) return -EINVAL; if (pf == NFPROTO_INET) { ret = nf_logger_find_get(NFPROTO_IPV4, type); if (ret < 0) return ret; ret = nf_logger_find_get(NFPROTO_IPV6, type); if (ret < 0) { nf_logger_put(NFPROTO_IPV4, type); return ret; } return 0; } rcu_read_lock(); logger = rcu_dereference(loggers[pf][type]); if (logger == NULL) goto out; if (try_module_get(logger->me)) ret = 0; out: rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(nf_logger_find_get); void nf_logger_put(int pf, enum nf_log_type type) { struct nf_logger *logger; if (pf == NFPROTO_INET) { nf_logger_put(NFPROTO_IPV4, type); nf_logger_put(NFPROTO_IPV6, type); return; } rcu_read_lock(); logger = rcu_dereference(loggers[pf][type]); if (!logger) WARN_ON_ONCE(1); else module_put(logger->me); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(nf_logger_put); void nf_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, const struct nf_loginfo *loginfo, const char *fmt, ...) { va_list args; char prefix[NF_LOG_PREFIXLEN]; const struct nf_logger *logger; rcu_read_lock(); if (loginfo != NULL) logger = rcu_dereference(loggers[pf][loginfo->type]); else logger = rcu_dereference(net->nf.nf_loggers[pf]); if (logger) { va_start(args, fmt); vsnprintf(prefix, sizeof(prefix), fmt, args); va_end(args); logger->logfn(net, pf, hooknum, skb, in, out, loginfo, prefix); } rcu_read_unlock(); } EXPORT_SYMBOL(nf_log_packet); void nf_log_trace(struct net *net, u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, const struct nf_loginfo *loginfo, const char *fmt, ...) { va_list args; char prefix[NF_LOG_PREFIXLEN]; const struct nf_logger *logger; rcu_read_lock(); logger = rcu_dereference(net->nf.nf_loggers[pf]); if (logger) { va_start(args, fmt); vsnprintf(prefix, sizeof(prefix), fmt, args); va_end(args); logger->logfn(net, pf, hooknum, skb, in, out, loginfo, prefix); } rcu_read_unlock(); } EXPORT_SYMBOL(nf_log_trace); #define S_SIZE (1024 - (sizeof(unsigned int) + 1)) struct nf_log_buf { unsigned int count; char buf[S_SIZE + 1]; }; static struct nf_log_buf emergency, *emergency_ptr = &emergency; __printf(2, 3) int nf_log_buf_add(struct nf_log_buf *m, const char *f, ...) { va_list args; int len; if (likely(m->count < S_SIZE)) { va_start(args, f); len = vsnprintf(m->buf + m->count, S_SIZE - m->count, f, args); va_end(args); if (likely(m->count + len < S_SIZE)) { m->count += len; return 0; } } m->count = S_SIZE; printk_once(KERN_ERR KBUILD_MODNAME " please increase S_SIZE\n"); return -1; } EXPORT_SYMBOL_GPL(nf_log_buf_add); struct nf_log_buf *nf_log_buf_open(void) { struct nf_log_buf *m = kmalloc(sizeof(*m), GFP_ATOMIC); if (unlikely(!m)) { local_bh_disable(); do { m = xchg(&emergency_ptr, NULL); } while (!m); } m->count = 0; return m; } EXPORT_SYMBOL_GPL(nf_log_buf_open); void nf_log_buf_close(struct nf_log_buf *m) { m->buf[m->count] = 0; printk("%s\n", m->buf); if (likely(m != &emergency)) kfree(m); else { emergency_ptr = m; local_bh_enable(); } } EXPORT_SYMBOL_GPL(nf_log_buf_close); #ifdef CONFIG_PROC_FS static void *seq_start(struct seq_file *seq, loff_t *pos) { struct net *net = seq_file_net(seq); mutex_lock(&nf_log_mutex); if (*pos >= ARRAY_SIZE(net->nf.nf_loggers)) return NULL; return pos; } static void *seq_next(struct seq_file *s, void *v, loff_t *pos) { struct net *net = seq_file_net(s); (*pos)++; if (*pos >= ARRAY_SIZE(net->nf.nf_loggers)) return NULL; return pos; } static void seq_stop(struct seq_file *s, void *v) { mutex_unlock(&nf_log_mutex); } static int seq_show(struct seq_file *s, void *v) { loff_t *pos = v; const struct nf_logger *logger; int i; struct net *net = seq_file_net(s); logger = nft_log_dereference(net->nf.nf_loggers[*pos]); if (!logger) seq_printf(s, "%2lld NONE (", *pos); else seq_printf(s, "%2lld %s (", *pos, logger->name); if (seq_has_overflowed(s)) return -ENOSPC; for (i = 0; i < NF_LOG_TYPE_MAX; i++) { if (loggers[*pos][i] == NULL) continue; logger = nft_log_dereference(loggers[*pos][i]); seq_puts(s, logger->name); if (i == 0 && loggers[*pos][i + 1] != NULL) seq_puts(s, ","); if (seq_has_overflowed(s)) return -ENOSPC; } seq_puts(s, ")\n"); if (seq_has_overflowed(s)) return -ENOSPC; return 0; } static const struct seq_operations nflog_seq_ops = { .start = seq_start, .next = seq_next, .stop = seq_stop, .show = seq_show, }; #endif /* PROC_FS */ #ifdef CONFIG_SYSCTL static char nf_log_sysctl_fnames[NFPROTO_NUMPROTO-NFPROTO_UNSPEC][3]; static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO]; static struct ctl_table_header *nf_log_sysctl_fhdr; static struct ctl_table nf_log_sysctl_ftable[] = { { .procname = "nf_log_all_netns", .data = &sysctl_nf_log_all_netns, .maxlen = sizeof(sysctl_nf_log_all_netns), .mode = 0644, .proc_handler = proc_dointvec, }, }; static int nf_log_proc_dostring(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { const struct nf_logger *logger; char buf[NFLOGGER_NAME_LEN]; int r = 0; int tindex = (unsigned long)table->extra1; struct net *net = table->extra2; if (write) { struct ctl_table tmp = *table; /* proc_dostring() can append to existing strings, so we need to * initialize it as an empty string. */ buf[0] = '\0'; tmp.data = buf; r = proc_dostring(&tmp, write, buffer, lenp, ppos); if (r) return r; if (!strcmp(buf, "NONE")) { nf_log_unbind_pf(net, tindex); return 0; } mutex_lock(&nf_log_mutex); logger = __find_logger(tindex, buf); if (logger == NULL) { mutex_unlock(&nf_log_mutex); return -ENOENT; } rcu_assign_pointer(net->nf.nf_loggers[tindex], logger); mutex_unlock(&nf_log_mutex); } else { struct ctl_table tmp = *table; tmp.data = buf; mutex_lock(&nf_log_mutex); logger = nft_log_dereference(net->nf.nf_loggers[tindex]); if (!logger) strscpy(buf, "NONE", sizeof(buf)); else strscpy(buf, logger->name, sizeof(buf)); mutex_unlock(&nf_log_mutex); r = proc_dostring(&tmp, write, buffer, lenp, ppos); } return r; } static int netfilter_log_sysctl_init(struct net *net) { int i; struct ctl_table *table; table = nf_log_sysctl_table; if (!net_eq(net, &init_net)) { table = kmemdup(nf_log_sysctl_table, sizeof(nf_log_sysctl_table), GFP_KERNEL); if (!table) goto err_alloc; } else { for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) { snprintf(nf_log_sysctl_fnames[i], 3, "%d", i); nf_log_sysctl_table[i].procname = nf_log_sysctl_fnames[i]; nf_log_sysctl_table[i].maxlen = NFLOGGER_NAME_LEN; nf_log_sysctl_table[i].mode = 0644; nf_log_sysctl_table[i].proc_handler = nf_log_proc_dostring; nf_log_sysctl_table[i].extra1 = (void *)(unsigned long) i; } nf_log_sysctl_fhdr = register_net_sysctl(net, "net/netfilter", nf_log_sysctl_ftable); if (!nf_log_sysctl_fhdr) goto err_freg; } for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) table[i].extra2 = net; net->nf.nf_log_dir_header = register_net_sysctl_sz(net, "net/netfilter/nf_log", table, ARRAY_SIZE(nf_log_sysctl_table)); if (!net->nf.nf_log_dir_header) goto err_reg; return 0; err_reg: if (!net_eq(net, &init_net)) kfree(table); else unregister_net_sysctl_table(nf_log_sysctl_fhdr); err_freg: err_alloc: return -ENOMEM; } static void netfilter_log_sysctl_exit(struct net *net) { const struct ctl_table *table; table = net->nf.nf_log_dir_header->ctl_table_arg; unregister_net_sysctl_table(net->nf.nf_log_dir_header); if (!net_eq(net, &init_net)) kfree(table); else unregister_net_sysctl_table(nf_log_sysctl_fhdr); } #else static int netfilter_log_sysctl_init(struct net *net) { return 0; } static void netfilter_log_sysctl_exit(struct net *net) { } #endif /* CONFIG_SYSCTL */ static int __net_init nf_log_net_init(struct net *net) { int ret = -ENOMEM; #ifdef CONFIG_PROC_FS if (!proc_create_net("nf_log", 0444, net->nf.proc_netfilter, &nflog_seq_ops, sizeof(struct seq_net_private))) return ret; #endif ret = netfilter_log_sysctl_init(net); if (ret < 0) goto out_sysctl; return 0; out_sysctl: #ifdef CONFIG_PROC_FS remove_proc_entry("nf_log", net->nf.proc_netfilter); #endif return ret; } static void __net_exit nf_log_net_exit(struct net *net) { netfilter_log_sysctl_exit(net); #ifdef CONFIG_PROC_FS remove_proc_entry("nf_log", net->nf.proc_netfilter); #endif } static struct pernet_operations nf_log_net_ops = { .init = nf_log_net_init, .exit = nf_log_net_exit, }; int __init netfilter_log_init(void) { return register_pernet_subsys(&nf_log_net_ops); }
695 695 394 702 708 704 700 121 120 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _MM_PERCPU_INTERNAL_H #define _MM_PERCPU_INTERNAL_H #include <linux/types.h> #include <linux/percpu.h> #include <linux/memcontrol.h> /* * pcpu_block_md is the metadata block struct. * Each chunk's bitmap is split into a number of full blocks. * All units are in terms of bits. * * The scan hint is the largest known contiguous area before the contig hint. * It is not necessarily the actual largest contig hint though. There is an * invariant that the scan_hint_start > contig_hint_start iff * scan_hint == contig_hint. This is necessary because when scanning forward, * we don't know if a new contig hint would be better than the current one. */ struct pcpu_block_md { int scan_hint; /* scan hint for block */ int scan_hint_start; /* block relative starting position of the scan hint */ int contig_hint; /* contig hint for block */ int contig_hint_start; /* block relative starting position of the contig hint */ int left_free; /* size of free space along the left side of the block */ int right_free; /* size of free space along the right side of the block */ int first_free; /* block position of first free */ int nr_bits; /* total bits responsible for */ }; struct pcpuobj_ext { #ifdef CONFIG_MEMCG_KMEM struct obj_cgroup *cgroup; #endif #ifdef CONFIG_MEM_ALLOC_PROFILING union codetag_ref tag; #endif }; #if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MEM_ALLOC_PROFILING) #define NEED_PCPUOBJ_EXT #endif struct pcpu_chunk { #ifdef CONFIG_PERCPU_STATS int nr_alloc; /* # of allocations */ size_t max_alloc_size; /* largest allocation size */ #endif struct list_head list; /* linked to pcpu_slot lists */ int free_bytes; /* free bytes in the chunk */ struct pcpu_block_md chunk_md; unsigned long *bound_map; /* boundary map */ /* * base_addr is the base address of this chunk. * To reduce false sharing, current layout is optimized to make sure * base_addr locate in the different cacheline with free_bytes and * chunk_md. */ void *base_addr ____cacheline_aligned_in_smp; unsigned long *alloc_map; /* allocation map */ struct pcpu_block_md *md_blocks; /* metadata blocks */ void *data; /* chunk data */ bool immutable; /* no [de]population allowed */ bool isolated; /* isolated from active chunk slots */ int start_offset; /* the overlap with the previous region to have a page aligned base_addr */ int end_offset; /* additional area required to have the region end page aligned */ #ifdef NEED_PCPUOBJ_EXT struct pcpuobj_ext *obj_exts; /* vector of object cgroups */ #endif int nr_pages; /* # of pages served by this chunk */ int nr_populated; /* # of populated pages */ int nr_empty_pop_pages; /* # of empty populated pages */ unsigned long populated[]; /* populated bitmap */ }; static inline bool need_pcpuobj_ext(void) { if (IS_ENABLED(CONFIG_MEM_ALLOC_PROFILING)) return true; if (!mem_cgroup_kmem_disabled()) return true; return false; } extern spinlock_t pcpu_lock; extern struct list_head *pcpu_chunk_lists; extern int pcpu_nr_slots; extern int pcpu_sidelined_slot; extern int pcpu_to_depopulate_slot; extern int pcpu_nr_empty_pop_pages; extern struct pcpu_chunk *pcpu_first_chunk; extern struct pcpu_chunk *pcpu_reserved_chunk; /** * pcpu_chunk_nr_blocks - converts nr_pages to # of md_blocks * @chunk: chunk of interest * * This conversion is from the number of physical pages that the chunk * serves to the number of bitmap blocks used. */ static inline int pcpu_chunk_nr_blocks(struct pcpu_chunk *chunk) { return chunk->nr_pages * PAGE_SIZE / PCPU_BITMAP_BLOCK_SIZE; } /** * pcpu_nr_pages_to_map_bits - converts the pages to size of bitmap * @pages: number of physical pages * * This conversion is from physical pages to the number of bits * required in the bitmap. */ static inline int pcpu_nr_pages_to_map_bits(int pages) { return pages * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE; } /** * pcpu_chunk_map_bits - helper to convert nr_pages to size of bitmap * @chunk: chunk of interest * * This conversion is from the number of physical pages that the chunk * serves to the number of bits in the bitmap. */ static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk) { return pcpu_nr_pages_to_map_bits(chunk->nr_pages); } /** * pcpu_obj_full_size - helper to calculate size of each accounted object * @size: size of area to allocate in bytes * * For each accounted object there is an extra space which is used to store * obj_cgroup membership if kmemcg is not disabled. Charge it too. */ static inline size_t pcpu_obj_full_size(size_t size) { size_t extra_size = 0; #ifdef CONFIG_MEMCG_KMEM if (!mem_cgroup_kmem_disabled()) extra_size += size / PCPU_MIN_ALLOC_SIZE * sizeof(struct obj_cgroup *); #endif return size * num_possible_cpus() + extra_size; } #ifdef CONFIG_PERCPU_STATS #include <linux/spinlock.h> struct percpu_stats { u64 nr_alloc; /* lifetime # of allocations */ u64 nr_dealloc; /* lifetime # of deallocations */ u64 nr_cur_alloc; /* current # of allocations */ u64 nr_max_alloc; /* max # of live allocations */ u32 nr_chunks; /* current # of live chunks */ u32 nr_max_chunks; /* max # of live chunks */ size_t min_alloc_size; /* min allocation size */ size_t max_alloc_size; /* max allocation size */ }; extern struct percpu_stats pcpu_stats; extern struct pcpu_alloc_info pcpu_stats_ai; /* * For debug purposes. We don't care about the flexible array. */ static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai) { memcpy(&pcpu_stats_ai, ai, sizeof(struct pcpu_alloc_info)); /* initialize min_alloc_size to unit_size */ pcpu_stats.min_alloc_size = pcpu_stats_ai.unit_size; } /* * pcpu_stats_area_alloc - increment area allocation stats * @chunk: the location of the area being allocated * @size: size of area to allocate in bytes * * CONTEXT: * pcpu_lock. */ static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size) { lockdep_assert_held(&pcpu_lock); pcpu_stats.nr_alloc++; pcpu_stats.nr_cur_alloc++; pcpu_stats.nr_max_alloc = max(pcpu_stats.nr_max_alloc, pcpu_stats.nr_cur_alloc); pcpu_stats.min_alloc_size = min(pcpu_stats.min_alloc_size, size); pcpu_stats.max_alloc_size = max(pcpu_stats.max_alloc_size, size); chunk->nr_alloc++; chunk->max_alloc_size = max(chunk->max_alloc_size, size); } /* * pcpu_stats_area_dealloc - decrement allocation stats * @chunk: the location of the area being deallocated * * CONTEXT: * pcpu_lock. */ static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk) { lockdep_assert_held(&pcpu_lock); pcpu_stats.nr_dealloc++; pcpu_stats.nr_cur_alloc--; chunk->nr_alloc--; } /* * pcpu_stats_chunk_alloc - increment chunk stats */ static inline void pcpu_stats_chunk_alloc(void) { unsigned long flags; spin_lock_irqsave(&pcpu_lock, flags); pcpu_stats.nr_chunks++; pcpu_stats.nr_max_chunks = max(pcpu_stats.nr_max_chunks, pcpu_stats.nr_chunks); spin_unlock_irqrestore(&pcpu_lock, flags); } /* * pcpu_stats_chunk_dealloc - decrement chunk stats */ static inline void pcpu_stats_chunk_dealloc(void) { unsigned long flags; spin_lock_irqsave(&pcpu_lock, flags); pcpu_stats.nr_chunks--; spin_unlock_irqrestore(&pcpu_lock, flags); } #else static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai) { } static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size) { } static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk) { } static inline void pcpu_stats_chunk_alloc(void) { } static inline void pcpu_stats_chunk_dealloc(void) { } #endif /* !CONFIG_PERCPU_STATS */ #endif
6 26 15 15 53 3 47 4 4 4 4 1 4 4 1 1 1 4 51 54 7 7 7 7 7 7 5 4 4 5 5 5 2 5 5 5 4 4 4 4 4 4 4 4 5 5 5 5 5 3 2 3 5 5 5 4 4 4 4 4 2 2 2 2 2 2 2 2 2 2 2 2 2 2 41 40 41 1 5 5 1 5 1 1 7 7 8 7 8 40 39 39 40 39 40 41 40 41 39 40 1 2 6 5 40 32 7 31 29 29 29 8 28 29 29 6 30 33 32 33 33 32 41 41 2 41 40 33 41 11 11 11 1 1 1 1 10 11 10 18 17 1 18 7 18 11 11 10 11 4 41 40 40 4 3 14 14 14 14 14 14 14 14 14 14 13 13 2 2 14 14 13 13 12 12 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 12 12 12 1 12 1 12 12 9 9 9 12 12 12 12 1 1 1 1 1 3 3 3 3 9 6 7 6 10 10 10 11 11 10 11 10 9 10 11 11 11 10 2 2 1 1 1 1 1 1 1 2 2 10 10 9 10 1 1 2 2 2 2 1 2 2 2 2 1 1 1 1 1 7 9 2 2 1 1 1 1 1 3 3 2 2 2 2 3 5 3 5 4 3 4 10 9 9 2 9 7 10 10 9 7 7 5 9 2 1 1 1 9 8 7 10 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 1 1 1 2 2 2 2 2 35 34 35 34 35 34 35 19 15 16 35 4 4 4 4 4 4 4 4 4 35 35 34 34 33 34 34 15 14 15 15 15 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 // SPDX-License-Identifier: GPL-2.0-or-later /* * ALSA sequencer Client Manager * Copyright (c) 1998-2001 by Frank van de Pol <fvdpol@coil.demon.nl> * Jaroslav Kysela <perex@perex.cz> * Takashi Iwai <tiwai@suse.de> */ #include <linux/init.h> #include <linux/export.h> #include <linux/slab.h> #include <sound/core.h> #include <sound/minors.h> #include <linux/kmod.h> #include <sound/seq_kernel.h> #include <sound/ump.h> #include "seq_clientmgr.h" #include "seq_memory.h" #include "seq_queue.h" #include "seq_timer.h" #include "seq_info.h" #include "seq_system.h" #include "seq_ump_convert.h" #include <sound/seq_device.h> #ifdef CONFIG_COMPAT #include <linux/compat.h> #endif /* Client Manager * this module handles the connections of userland and kernel clients * */ /* * There are four ranges of client numbers (last two shared): * 0..15: global clients * 16..127: statically allocated client numbers for cards 0..27 * 128..191: dynamically allocated client numbers for cards 28..31 * 128..191: dynamically allocated client numbers for applications */ /* number of kernel non-card clients */ #define SNDRV_SEQ_GLOBAL_CLIENTS 16 /* clients per cards, for static clients */ #define SNDRV_SEQ_CLIENTS_PER_CARD 4 /* dynamically allocated client numbers (both kernel drivers and user space) */ #define SNDRV_SEQ_DYNAMIC_CLIENTS_BEGIN 128 #define SNDRV_SEQ_LFLG_INPUT 0x0001 #define SNDRV_SEQ_LFLG_OUTPUT 0x0002 #define SNDRV_SEQ_LFLG_OPEN (SNDRV_SEQ_LFLG_INPUT|SNDRV_SEQ_LFLG_OUTPUT) static DEFINE_SPINLOCK(clients_lock); static DEFINE_MUTEX(register_mutex); /* * client table */ static char clienttablock[SNDRV_SEQ_MAX_CLIENTS]; static struct snd_seq_client *clienttab[SNDRV_SEQ_MAX_CLIENTS]; static struct snd_seq_usage client_usage; /* * prototypes */ static int bounce_error_event(struct snd_seq_client *client, struct snd_seq_event *event, int err, int atomic, int hop); static int snd_seq_deliver_single_event(struct snd_seq_client *client, struct snd_seq_event *event, int filter, int atomic, int hop); #if IS_ENABLED(CONFIG_SND_SEQ_UMP) static void free_ump_info(struct snd_seq_client *client); #endif /* */ static inline unsigned short snd_seq_file_flags(struct file *file) { switch (file->f_mode & (FMODE_READ | FMODE_WRITE)) { case FMODE_WRITE: return SNDRV_SEQ_LFLG_OUTPUT; case FMODE_READ: return SNDRV_SEQ_LFLG_INPUT; default: return SNDRV_SEQ_LFLG_OPEN; } } static inline int snd_seq_write_pool_allocated(struct snd_seq_client *client) { return snd_seq_total_cells(client->pool) > 0; } /* return pointer to client structure for specified id */ static struct snd_seq_client *clientptr(int clientid) { if (clientid < 0 || clientid >= SNDRV_SEQ_MAX_CLIENTS) { pr_debug("ALSA: seq: oops. Trying to get pointer to client %d\n", clientid); return NULL; } return clienttab[clientid]; } struct snd_seq_client *snd_seq_client_use_ptr(int clientid) { unsigned long flags; struct snd_seq_client *client; if (clientid < 0 || clientid >= SNDRV_SEQ_MAX_CLIENTS) { pr_debug("ALSA: seq: oops. Trying to get pointer to client %d\n", clientid); return NULL; } spin_lock_irqsave(&clients_lock, flags); client = clientptr(clientid); if (client) goto __lock; if (clienttablock[clientid]) { spin_unlock_irqrestore(&clients_lock, flags); return NULL; } spin_unlock_irqrestore(&clients_lock, flags); #ifdef CONFIG_MODULES if (!in_interrupt()) { static DECLARE_BITMAP(client_requested, SNDRV_SEQ_GLOBAL_CLIENTS); static DECLARE_BITMAP(card_requested, SNDRV_CARDS); if (clientid < SNDRV_SEQ_GLOBAL_CLIENTS) { int idx; if (!test_and_set_bit(clientid, client_requested)) { for (idx = 0; idx < 15; idx++) { if (seq_client_load[idx] < 0) break; if (seq_client_load[idx] == clientid) { request_module("snd-seq-client-%i", clientid); break; } } } } else if (clientid < SNDRV_SEQ_DYNAMIC_CLIENTS_BEGIN) { int card = (clientid - SNDRV_SEQ_GLOBAL_CLIENTS) / SNDRV_SEQ_CLIENTS_PER_CARD; if (card < snd_ecards_limit) { if (!test_and_set_bit(card, card_requested)) snd_request_card(card); snd_seq_device_load_drivers(); } } spin_lock_irqsave(&clients_lock, flags); client = clientptr(clientid); if (client) goto __lock; spin_unlock_irqrestore(&clients_lock, flags); } #endif return NULL; __lock: snd_use_lock_use(&client->use_lock); spin_unlock_irqrestore(&clients_lock, flags); return client; } /* Take refcount and perform ioctl_mutex lock on the given client; * used only for OSS sequencer * Unlock via snd_seq_client_ioctl_unlock() below */ bool snd_seq_client_ioctl_lock(int clientid) { struct snd_seq_client *client; client = snd_seq_client_use_ptr(clientid); if (!client) return false; mutex_lock(&client->ioctl_mutex); /* The client isn't unrefed here; see snd_seq_client_ioctl_unlock() */ return true; } EXPORT_SYMBOL_GPL(snd_seq_client_ioctl_lock); /* Unlock and unref the given client; for OSS sequencer use only */ void snd_seq_client_ioctl_unlock(int clientid) { struct snd_seq_client *client; client = snd_seq_client_use_ptr(clientid); if (WARN_ON(!client)) return; mutex_unlock(&client->ioctl_mutex); /* The doubly unrefs below are intentional; the first one releases the * leftover from snd_seq_client_ioctl_lock() above, and the second one * is for releasing snd_seq_client_use_ptr() in this function */ snd_seq_client_unlock(client); snd_seq_client_unlock(client); } EXPORT_SYMBOL_GPL(snd_seq_client_ioctl_unlock); static void usage_alloc(struct snd_seq_usage *res, int num) { res->cur += num; if (res->cur > res->peak) res->peak = res->cur; } static void usage_free(struct snd_seq_usage *res, int num) { res->cur -= num; } /* initialise data structures */ int __init client_init_data(void) { /* zap out the client table */ memset(&clienttablock, 0, sizeof(clienttablock)); memset(&clienttab, 0, sizeof(clienttab)); return 0; } static struct snd_seq_client *seq_create_client1(int client_index, int poolsize) { int c; struct snd_seq_client *client; /* init client data */ client = kzalloc(sizeof(*client), GFP_KERNEL); if (client == NULL) return NULL; client->pool = snd_seq_pool_new(poolsize); if (client->pool == NULL) { kfree(client); return NULL; } client->type = NO_CLIENT; snd_use_lock_init(&client->use_lock); rwlock_init(&client->ports_lock); mutex_init(&client->ports_mutex); INIT_LIST_HEAD(&client->ports_list_head); mutex_init(&client->ioctl_mutex); client->ump_endpoint_port = -1; /* find free slot in the client table */ spin_lock_irq(&clients_lock); if (client_index < 0) { for (c = SNDRV_SEQ_DYNAMIC_CLIENTS_BEGIN; c < SNDRV_SEQ_MAX_CLIENTS; c++) { if (clienttab[c] || clienttablock[c]) continue; clienttab[client->number = c] = client; spin_unlock_irq(&clients_lock); return client; } } else { if (clienttab[client_index] == NULL && !clienttablock[client_index]) { clienttab[client->number = client_index] = client; spin_unlock_irq(&clients_lock); return client; } } spin_unlock_irq(&clients_lock); snd_seq_pool_delete(&client->pool); kfree(client); return NULL; /* no free slot found or busy, return failure code */ } static int seq_free_client1(struct snd_seq_client *client) { if (!client) return 0; spin_lock_irq(&clients_lock); clienttablock[client->number] = 1; clienttab[client->number] = NULL; spin_unlock_irq(&clients_lock); snd_seq_delete_all_ports(client); snd_seq_queue_client_leave(client->number); snd_use_lock_sync(&client->use_lock); if (client->pool) snd_seq_pool_delete(&client->pool); spin_lock_irq(&clients_lock); clienttablock[client->number] = 0; spin_unlock_irq(&clients_lock); return 0; } static void seq_free_client(struct snd_seq_client * client) { mutex_lock(&register_mutex); switch (client->type) { case NO_CLIENT: pr_warn("ALSA: seq: Trying to free unused client %d\n", client->number); break; case USER_CLIENT: case KERNEL_CLIENT: seq_free_client1(client); usage_free(&client_usage, 1); break; default: pr_err("ALSA: seq: Trying to free client %d with undefined type = %d\n", client->number, client->type); } mutex_unlock(&register_mutex); snd_seq_system_client_ev_client_exit(client->number); } /* -------------------------------------------------------- */ /* create a user client */ static int snd_seq_open(struct inode *inode, struct file *file) { int c, mode; /* client id */ struct snd_seq_client *client; struct snd_seq_user_client *user; int err; err = stream_open(inode, file); if (err < 0) return err; mutex_lock(&register_mutex); client = seq_create_client1(-1, SNDRV_SEQ_DEFAULT_EVENTS); if (!client) { mutex_unlock(&register_mutex); return -ENOMEM; /* failure code */ } mode = snd_seq_file_flags(file); if (mode & SNDRV_SEQ_LFLG_INPUT) client->accept_input = 1; if (mode & SNDRV_SEQ_LFLG_OUTPUT) client->accept_output = 1; user = &client->data.user; user->fifo = NULL; user->fifo_pool_size = 0; if (mode & SNDRV_SEQ_LFLG_INPUT) { user->fifo_pool_size = SNDRV_SEQ_DEFAULT_CLIENT_EVENTS; user->fifo = snd_seq_fifo_new(user->fifo_pool_size); if (user->fifo == NULL) { seq_free_client1(client); kfree(client); mutex_unlock(&register_mutex); return -ENOMEM; } } usage_alloc(&client_usage, 1); client->type = USER_CLIENT; mutex_unlock(&register_mutex); c = client->number; file->private_data = client; /* fill client data */ user->file = file; sprintf(client->name, "Client-%d", c); client->data.user.owner = get_pid(task_pid(current)); /* make others aware this new client */ snd_seq_system_client_ev_client_start(c); return 0; } /* delete a user client */ static int snd_seq_release(struct inode *inode, struct file *file) { struct snd_seq_client *client = file->private_data; if (client) { seq_free_client(client); if (client->data.user.fifo) snd_seq_fifo_delete(&client->data.user.fifo); #if IS_ENABLED(CONFIG_SND_SEQ_UMP) free_ump_info(client); #endif put_pid(client->data.user.owner); kfree(client); } return 0; } static bool event_is_compatible(const struct snd_seq_client *client, const struct snd_seq_event *ev) { if (snd_seq_ev_is_ump(ev) && !client->midi_version) return false; if (snd_seq_ev_is_ump(ev) && snd_seq_ev_is_variable(ev)) return false; return true; } /* handle client read() */ /* possible error values: * -ENXIO invalid client or file open mode * -ENOSPC FIFO overflow (the flag is cleared after this error report) * -EINVAL no enough user-space buffer to write the whole event * -EFAULT seg. fault during copy to user space */ static ssize_t snd_seq_read(struct file *file, char __user *buf, size_t count, loff_t *offset) { struct snd_seq_client *client = file->private_data; struct snd_seq_fifo *fifo; size_t aligned_size; int err; long result = 0; struct snd_seq_event_cell *cell; if (!(snd_seq_file_flags(file) & SNDRV_SEQ_LFLG_INPUT)) return -ENXIO; if (!access_ok(buf, count)) return -EFAULT; /* check client structures are in place */ if (snd_BUG_ON(!client)) return -ENXIO; if (!client->accept_input) return -ENXIO; fifo = client->data.user.fifo; if (!fifo) return -ENXIO; if (atomic_read(&fifo->overflow) > 0) { /* buffer overflow is detected */ snd_seq_fifo_clear(fifo); /* return error code */ return -ENOSPC; } cell = NULL; err = 0; snd_seq_fifo_lock(fifo); if (IS_ENABLED(CONFIG_SND_SEQ_UMP) && client->midi_version > 0) aligned_size = sizeof(struct snd_seq_ump_event); else aligned_size = sizeof(struct snd_seq_event); /* while data available in queue */ while (count >= aligned_size) { int nonblock; nonblock = (file->f_flags & O_NONBLOCK) || result > 0; err = snd_seq_fifo_cell_out(fifo, &cell, nonblock); if (err < 0) break; if (!event_is_compatible(client, &cell->event)) { snd_seq_cell_free(cell); cell = NULL; continue; } if (snd_seq_ev_is_variable(&cell->event)) { struct snd_seq_ump_event tmpev; memcpy(&tmpev, &cell->event, aligned_size); tmpev.data.ext.len &= ~SNDRV_SEQ_EXT_MASK; if (copy_to_user(buf, &tmpev, aligned_size)) { err = -EFAULT; break; } count -= aligned_size; buf += aligned_size; err = snd_seq_expand_var_event(&cell->event, count, (char __force *)buf, 0, aligned_size); if (err < 0) break; result += err; count -= err; buf += err; } else { if (copy_to_user(buf, &cell->event, aligned_size)) { err = -EFAULT; break; } count -= aligned_size; buf += aligned_size; } snd_seq_cell_free(cell); cell = NULL; /* to be sure */ result += aligned_size; } if (err < 0) { if (cell) snd_seq_fifo_cell_putback(fifo, cell); if (err == -EAGAIN && result > 0) err = 0; } snd_seq_fifo_unlock(fifo); return (err < 0) ? err : result; } /* * check access permission to the port */ static int check_port_perm(struct snd_seq_client_port *port, unsigned int flags) { if ((port->capability & flags) != flags) return 0; return flags; } /* * check if the destination client is available, and return the pointer * if filter is non-zero, client filter bitmap is tested. */ static struct snd_seq_client *get_event_dest_client(struct snd_seq_event *event, int filter) { struct snd_seq_client *dest; dest = snd_seq_client_use_ptr(event->dest.client); if (dest == NULL) return NULL; if (! dest->accept_input) goto __not_avail; if ((dest->filter & SNDRV_SEQ_FILTER_USE_EVENT) && ! test_bit(event->type, dest->event_filter)) goto __not_avail; if (filter && !(dest->filter & filter)) goto __not_avail; return dest; /* ok - accessible */ __not_avail: snd_seq_client_unlock(dest); return NULL; } /* * Return the error event. * * If the receiver client is a user client, the original event is * encapsulated in SNDRV_SEQ_EVENT_BOUNCE as variable length event. If * the original event is also variable length, the external data is * copied after the event record. * If the receiver client is a kernel client, the original event is * quoted in SNDRV_SEQ_EVENT_KERNEL_ERROR, since this requires no extra * kmalloc. */ static int bounce_error_event(struct snd_seq_client *client, struct snd_seq_event *event, int err, int atomic, int hop) { struct snd_seq_event bounce_ev; int result; if (client == NULL || ! (client->filter & SNDRV_SEQ_FILTER_BOUNCE) || ! client->accept_input) return 0; /* ignored */ /* set up quoted error */ memset(&bounce_ev, 0, sizeof(bounce_ev)); bounce_ev.type = SNDRV_SEQ_EVENT_KERNEL_ERROR; bounce_ev.flags = SNDRV_SEQ_EVENT_LENGTH_FIXED; bounce_ev.queue = SNDRV_SEQ_QUEUE_DIRECT; bounce_ev.source.client = SNDRV_SEQ_CLIENT_SYSTEM; bounce_ev.source.port = SNDRV_SEQ_PORT_SYSTEM_ANNOUNCE; bounce_ev.dest.client = client->number; bounce_ev.dest.port = event->source.port; bounce_ev.data.quote.origin = event->dest; bounce_ev.data.quote.event = event; bounce_ev.data.quote.value = -err; /* use positive value */ result = snd_seq_deliver_single_event(NULL, &bounce_ev, 0, atomic, hop + 1); if (result < 0) { client->event_lost++; return result; } return result; } /* * rewrite the time-stamp of the event record with the curren time * of the given queue. * return non-zero if updated. */ static int update_timestamp_of_queue(struct snd_seq_event *event, int queue, int real_time) { struct snd_seq_queue *q; q = queueptr(queue); if (! q) return 0; event->queue = queue; event->flags &= ~SNDRV_SEQ_TIME_STAMP_MASK; if (real_time) { event->time.time = snd_seq_timer_get_cur_time(q->timer, true); event->flags |= SNDRV_SEQ_TIME_STAMP_REAL; } else { event->time.tick = snd_seq_timer_get_cur_tick(q->timer); event->flags |= SNDRV_SEQ_TIME_STAMP_TICK; } queuefree(q); return 1; } /* deliver a single event; called from below and UMP converter */ int __snd_seq_deliver_single_event(struct snd_seq_client *dest, struct snd_seq_client_port *dest_port, struct snd_seq_event *event, int atomic, int hop) { switch (dest->type) { case USER_CLIENT: if (!dest->data.user.fifo) return 0; return snd_seq_fifo_event_in(dest->data.user.fifo, event); case KERNEL_CLIENT: if (!dest_port->event_input) return 0; return dest_port->event_input(event, snd_seq_ev_is_direct(event), dest_port->private_data, atomic, hop); } return 0; } /* * deliver an event to the specified destination. * if filter is non-zero, client filter bitmap is tested. * * RETURN VALUE: 0 : if succeeded * <0 : error */ static int snd_seq_deliver_single_event(struct snd_seq_client *client, struct snd_seq_event *event, int filter, int atomic, int hop) { struct snd_seq_client *dest = NULL; struct snd_seq_client_port *dest_port = NULL; int result = -ENOENT; int direct; direct = snd_seq_ev_is_direct(event); dest = get_event_dest_client(event, filter); if (dest == NULL) goto __skip; dest_port = snd_seq_port_use_ptr(dest, event->dest.port); if (dest_port == NULL) goto __skip; /* check permission */ if (! check_port_perm(dest_port, SNDRV_SEQ_PORT_CAP_WRITE)) { result = -EPERM; goto __skip; } if (dest_port->timestamping) update_timestamp_of_queue(event, dest_port->time_queue, dest_port->time_real); #if IS_ENABLED(CONFIG_SND_SEQ_UMP) if (!(dest->filter & SNDRV_SEQ_FILTER_NO_CONVERT)) { if (snd_seq_ev_is_ump(event)) { result = snd_seq_deliver_from_ump(client, dest, dest_port, event, atomic, hop); goto __skip; } else if (snd_seq_client_is_ump(dest)) { result = snd_seq_deliver_to_ump(client, dest, dest_port, event, atomic, hop); goto __skip; } } #endif /* CONFIG_SND_SEQ_UMP */ result = __snd_seq_deliver_single_event(dest, dest_port, event, atomic, hop); __skip: if (dest_port) snd_seq_port_unlock(dest_port); if (dest) snd_seq_client_unlock(dest); if (result < 0 && !direct) { result = bounce_error_event(client, event, result, atomic, hop); } return result; } /* * send the event to all subscribers: */ static int __deliver_to_subscribers(struct snd_seq_client *client, struct snd_seq_event *event, struct snd_seq_client_port *src_port, int atomic, int hop) { struct snd_seq_subscribers *subs; int err, result = 0, num_ev = 0; union __snd_seq_event event_saved; size_t saved_size; struct snd_seq_port_subs_info *grp; /* save original event record */ saved_size = snd_seq_event_packet_size(event); memcpy(&event_saved, event, saved_size); grp = &src_port->c_src; /* lock list */ if (atomic) read_lock(&grp->list_lock); else down_read_nested(&grp->list_mutex, hop); list_for_each_entry(subs, &grp->list_head, src_list) { /* both ports ready? */ if (atomic_read(&subs->ref_count) != 2) continue; event->dest = subs->info.dest; if (subs->info.flags & SNDRV_SEQ_PORT_SUBS_TIMESTAMP) /* convert time according to flag with subscription */ update_timestamp_of_queue(event, subs->info.queue, subs->info.flags & SNDRV_SEQ_PORT_SUBS_TIME_REAL); err = snd_seq_deliver_single_event(client, event, 0, atomic, hop); if (err < 0) { /* save first error that occurs and continue */ if (!result) result = err; continue; } num_ev++; /* restore original event record */ memcpy(event, &event_saved, saved_size); } if (atomic) read_unlock(&grp->list_lock); else up_read(&grp->list_mutex); memcpy(event, &event_saved, saved_size); return (result < 0) ? result : num_ev; } static int deliver_to_subscribers(struct snd_seq_client *client, struct snd_seq_event *event, int atomic, int hop) { struct snd_seq_client_port *src_port; int ret = 0, ret2; src_port = snd_seq_port_use_ptr(client, event->source.port); if (src_port) { ret = __deliver_to_subscribers(client, event, src_port, atomic, hop); snd_seq_port_unlock(src_port); } if (client->ump_endpoint_port < 0 || event->source.port == client->ump_endpoint_port) return ret; src_port = snd_seq_port_use_ptr(client, client->ump_endpoint_port); if (!src_port) return ret; ret2 = __deliver_to_subscribers(client, event, src_port, atomic, hop); snd_seq_port_unlock(src_port); return ret2 < 0 ? ret2 : ret; } /* deliver an event to the destination port(s). * if the event is to subscribers or broadcast, the event is dispatched * to multiple targets. * * RETURN VALUE: n > 0 : the number of delivered events. * n == 0 : the event was not passed to any client. * n < 0 : error - event was not processed. */ static int snd_seq_deliver_event(struct snd_seq_client *client, struct snd_seq_event *event, int atomic, int hop) { int result; hop++; if (hop >= SNDRV_SEQ_MAX_HOPS) { pr_debug("ALSA: seq: too long delivery path (%d:%d->%d:%d)\n", event->source.client, event->source.port, event->dest.client, event->dest.port); return -EMLINK; } if (snd_seq_ev_is_variable(event) && snd_BUG_ON(atomic && (event->data.ext.len & SNDRV_SEQ_EXT_USRPTR))) return -EINVAL; if (event->queue == SNDRV_SEQ_ADDRESS_SUBSCRIBERS || event->dest.client == SNDRV_SEQ_ADDRESS_SUBSCRIBERS) result = deliver_to_subscribers(client, event, atomic, hop); else result = snd_seq_deliver_single_event(client, event, 0, atomic, hop); return result; } /* * dispatch an event cell: * This function is called only from queue check routines in timer * interrupts or after enqueued. * The event cell shall be released or re-queued in this function. * * RETURN VALUE: n > 0 : the number of delivered events. * n == 0 : the event was not passed to any client. * n < 0 : error - event was not processed. */ int snd_seq_dispatch_event(struct snd_seq_event_cell *cell, int atomic, int hop) { struct snd_seq_client *client; int result; if (snd_BUG_ON(!cell)) return -EINVAL; client = snd_seq_client_use_ptr(cell->event.source.client); if (client == NULL) { snd_seq_cell_free(cell); /* release this cell */ return -EINVAL; } if (!snd_seq_ev_is_ump(&cell->event) && cell->event.type == SNDRV_SEQ_EVENT_NOTE) { /* NOTE event: * the event cell is re-used as a NOTE-OFF event and * enqueued again. */ struct snd_seq_event tmpev, *ev; /* reserve this event to enqueue note-off later */ tmpev = cell->event; tmpev.type = SNDRV_SEQ_EVENT_NOTEON; result = snd_seq_deliver_event(client, &tmpev, atomic, hop); /* * This was originally a note event. We now re-use the * cell for the note-off event. */ ev = &cell->event; ev->type = SNDRV_SEQ_EVENT_NOTEOFF; ev->flags |= SNDRV_SEQ_PRIORITY_HIGH; /* add the duration time */ switch (ev->flags & SNDRV_SEQ_TIME_STAMP_MASK) { case SNDRV_SEQ_TIME_STAMP_TICK: cell->event.time.tick += ev->data.note.duration; break; case SNDRV_SEQ_TIME_STAMP_REAL: /* unit for duration is ms */ ev->time.time.tv_nsec += 1000000 * (ev->data.note.duration % 1000); ev->time.time.tv_sec += ev->data.note.duration / 1000 + ev->time.time.tv_nsec / 1000000000; ev->time.time.tv_nsec %= 1000000000; break; } ev->data.note.velocity = ev->data.note.off_velocity; /* Now queue this cell as the note off event */ if (snd_seq_enqueue_event(cell, atomic, hop) < 0) snd_seq_cell_free(cell); /* release this cell */ } else { /* Normal events: * event cell is freed after processing the event */ result = snd_seq_deliver_event(client, &cell->event, atomic, hop); snd_seq_cell_free(cell); } snd_seq_client_unlock(client); return result; } /* Allocate a cell from client pool and enqueue it to queue: * if pool is empty and blocking is TRUE, sleep until a new cell is * available. */ static int snd_seq_client_enqueue_event(struct snd_seq_client *client, struct snd_seq_event *event, struct file *file, int blocking, int atomic, int hop, struct mutex *mutexp) { struct snd_seq_event_cell *cell; int err; /* special queue values - force direct passing */ if (event->queue == SNDRV_SEQ_ADDRESS_SUBSCRIBERS) { event->dest.client = SNDRV_SEQ_ADDRESS_SUBSCRIBERS; event->queue = SNDRV_SEQ_QUEUE_DIRECT; } else if (event->dest.client == SNDRV_SEQ_ADDRESS_SUBSCRIBERS) { /* check presence of source port */ struct snd_seq_client_port *src_port = snd_seq_port_use_ptr(client, event->source.port); if (src_port == NULL) return -EINVAL; snd_seq_port_unlock(src_port); } /* direct event processing without enqueued */ if (snd_seq_ev_is_direct(event)) { if (!snd_seq_ev_is_ump(event) && event->type == SNDRV_SEQ_EVENT_NOTE) return -EINVAL; /* this event must be enqueued! */ return snd_seq_deliver_event(client, event, atomic, hop); } /* Not direct, normal queuing */ if (snd_seq_queue_is_used(event->queue, client->number) <= 0) return -EINVAL; /* invalid queue */ if (! snd_seq_write_pool_allocated(client)) return -ENXIO; /* queue is not allocated */ /* allocate an event cell */ err = snd_seq_event_dup(client->pool, event, &cell, !blocking || atomic, file, mutexp); if (err < 0) return err; /* we got a cell. enqueue it. */ err = snd_seq_enqueue_event(cell, atomic, hop); if (err < 0) { snd_seq_cell_free(cell); return err; } return 0; } /* * check validity of event type and data length. * return non-zero if invalid. */ static int check_event_type_and_length(struct snd_seq_event *ev) { switch (snd_seq_ev_length_type(ev)) { case SNDRV_SEQ_EVENT_LENGTH_FIXED: if (snd_seq_ev_is_variable_type(ev)) return -EINVAL; break; case SNDRV_SEQ_EVENT_LENGTH_VARIABLE: if (! snd_seq_ev_is_variable_type(ev) || (ev->data.ext.len & ~SNDRV_SEQ_EXT_MASK) >= SNDRV_SEQ_MAX_EVENT_LEN) return -EINVAL; break; case SNDRV_SEQ_EVENT_LENGTH_VARUSR: if (! snd_seq_ev_is_direct(ev)) return -EINVAL; break; } return 0; } /* handle write() */ /* possible error values: * -ENXIO invalid client or file open mode * -ENOMEM malloc failed * -EFAULT seg. fault during copy from user space * -EINVAL invalid event * -EAGAIN no space in output pool * -EINTR interrupts while sleep * -EMLINK too many hops * others depends on return value from driver callback */ static ssize_t snd_seq_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { struct snd_seq_client *client = file->private_data; int written = 0, len; int err, handled; union __snd_seq_event __event; struct snd_seq_event *ev = &__event.legacy; if (!(snd_seq_file_flags(file) & SNDRV_SEQ_LFLG_OUTPUT)) return -ENXIO; /* check client structures are in place */ if (snd_BUG_ON(!client)) return -ENXIO; if (!client->accept_output || client->pool == NULL) return -ENXIO; repeat: handled = 0; /* allocate the pool now if the pool is not allocated yet */ mutex_lock(&client->ioctl_mutex); if (client->pool->size > 0 && !snd_seq_write_pool_allocated(client)) { err = snd_seq_pool_init(client->pool); if (err < 0) goto out; } /* only process whole events */ err = -EINVAL; while (count >= sizeof(struct snd_seq_event)) { /* Read in the event header from the user */ len = sizeof(struct snd_seq_event); if (copy_from_user(ev, buf, len)) { err = -EFAULT; break; } /* read in the rest bytes for UMP events */ if (snd_seq_ev_is_ump(ev)) { if (count < sizeof(struct snd_seq_ump_event)) break; if (copy_from_user((char *)ev + len, buf + len, sizeof(struct snd_seq_ump_event) - len)) { err = -EFAULT; break; } len = sizeof(struct snd_seq_ump_event); } ev->source.client = client->number; /* fill in client number */ /* Check for extension data length */ if (check_event_type_and_length(ev)) { err = -EINVAL; break; } if (!event_is_compatible(client, ev)) { err = -EINVAL; break; } /* check for special events */ if (!snd_seq_ev_is_ump(ev)) { if (ev->type == SNDRV_SEQ_EVENT_NONE) goto __skip_event; else if (snd_seq_ev_is_reserved(ev)) { err = -EINVAL; break; } } if (snd_seq_ev_is_variable(ev)) { int extlen = ev->data.ext.len & ~SNDRV_SEQ_EXT_MASK; if ((size_t)(extlen + len) > count) { /* back out, will get an error this time or next */ err = -EINVAL; break; } /* set user space pointer */ ev->data.ext.len = extlen | SNDRV_SEQ_EXT_USRPTR; ev->data.ext.ptr = (char __force *)buf + len; len += extlen; /* increment data length */ } else { #ifdef CONFIG_COMPAT if (client->convert32 && snd_seq_ev_is_varusr(ev)) ev->data.ext.ptr = (void __force *)compat_ptr(ev->data.raw32.d[1]); #endif } /* ok, enqueue it */ err = snd_seq_client_enqueue_event(client, ev, file, !(file->f_flags & O_NONBLOCK), 0, 0, &client->ioctl_mutex); if (err < 0) break; handled++; __skip_event: /* Update pointers and counts */ count -= len; buf += len; written += len; /* let's have a coffee break if too many events are queued */ if (++handled >= 200) { mutex_unlock(&client->ioctl_mutex); goto repeat; } } out: mutex_unlock(&client->ioctl_mutex); return written ? written : err; } /* * handle polling */ static __poll_t snd_seq_poll(struct file *file, poll_table * wait) { struct snd_seq_client *client = file->private_data; __poll_t mask = 0; /* check client structures are in place */ if (snd_BUG_ON(!client)) return EPOLLERR; if ((snd_seq_file_flags(file) & SNDRV_SEQ_LFLG_INPUT) && client->data.user.fifo) { /* check if data is available in the outqueue */ if (snd_seq_fifo_poll_wait(client->data.user.fifo, file, wait)) mask |= EPOLLIN | EPOLLRDNORM; } if (snd_seq_file_flags(file) & SNDRV_SEQ_LFLG_OUTPUT) { /* check if data is available in the pool */ if (!snd_seq_write_pool_allocated(client) || snd_seq_pool_poll_wait(client->pool, file, wait)) mask |= EPOLLOUT | EPOLLWRNORM; } return mask; } /*-----------------------------------------------------*/ static int snd_seq_ioctl_pversion(struct snd_seq_client *client, void *arg) { int *pversion = arg; *pversion = SNDRV_SEQ_VERSION; return 0; } static int snd_seq_ioctl_user_pversion(struct snd_seq_client *client, void *arg) { client->user_pversion = *(unsigned int *)arg; return 0; } static int snd_seq_ioctl_client_id(struct snd_seq_client *client, void *arg) { int *client_id = arg; *client_id = client->number; return 0; } /* SYSTEM_INFO ioctl() */ static int snd_seq_ioctl_system_info(struct snd_seq_client *client, void *arg) { struct snd_seq_system_info *info = arg; memset(info, 0, sizeof(*info)); /* fill the info fields */ info->queues = SNDRV_SEQ_MAX_QUEUES; info->clients = SNDRV_SEQ_MAX_CLIENTS; info->ports = SNDRV_SEQ_MAX_PORTS; info->channels = 256; /* fixed limit */ info->cur_clients = client_usage.cur; info->cur_queues = snd_seq_queue_get_cur_queues(); return 0; } /* RUNNING_MODE ioctl() */ static int snd_seq_ioctl_running_mode(struct snd_seq_client *client, void *arg) { struct snd_seq_running_info *info = arg; struct snd_seq_client *cptr; int err = 0; /* requested client number */ cptr = snd_seq_client_use_ptr(info->client); if (cptr == NULL) return -ENOENT; /* don't change !!! */ #ifdef SNDRV_BIG_ENDIAN if (!info->big_endian) { err = -EINVAL; goto __err; } #else if (info->big_endian) { err = -EINVAL; goto __err; } #endif if (info->cpu_mode > sizeof(long)) { err = -EINVAL; goto __err; } cptr->convert32 = (info->cpu_mode < sizeof(long)); __err: snd_seq_client_unlock(cptr); return err; } /* CLIENT_INFO ioctl() */ static void get_client_info(struct snd_seq_client *cptr, struct snd_seq_client_info *info) { info->client = cptr->number; /* fill the info fields */ info->type = cptr->type; strcpy(info->name, cptr->name); info->filter = cptr->filter; info->event_lost = cptr->event_lost; memcpy(info->event_filter, cptr->event_filter, 32); info->group_filter = cptr->group_filter; info->num_ports = cptr->num_ports; if (cptr->type == USER_CLIENT) info->pid = pid_vnr(cptr->data.user.owner); else info->pid = -1; if (cptr->type == KERNEL_CLIENT) info->card = cptr->data.kernel.card ? cptr->data.kernel.card->number : -1; else info->card = -1; info->midi_version = cptr->midi_version; memset(info->reserved, 0, sizeof(info->reserved)); } static int snd_seq_ioctl_get_client_info(struct snd_seq_client *client, void *arg) { struct snd_seq_client_info *client_info = arg; struct snd_seq_client *cptr; /* requested client number */ cptr = snd_seq_client_use_ptr(client_info->client); if (cptr == NULL) return -ENOENT; /* don't change !!! */ get_client_info(cptr, client_info); snd_seq_client_unlock(cptr); return 0; } /* CLIENT_INFO ioctl() */ static int snd_seq_ioctl_set_client_info(struct snd_seq_client *client, void *arg) { struct snd_seq_client_info *client_info = arg; /* it is not allowed to set the info fields for an another client */ if (client->number != client_info->client) return -EPERM; /* also client type must be set now */ if (client->type != client_info->type) return -EINVAL; /* check validity of midi_version field */ if (client->user_pversion >= SNDRV_PROTOCOL_VERSION(1, 0, 3) && client_info->midi_version > SNDRV_SEQ_CLIENT_UMP_MIDI_2_0) return -EINVAL; /* fill the info fields */ if (client_info->name[0]) strscpy(client->name, client_info->name, sizeof(client->name)); client->filter = client_info->filter; client->event_lost = client_info->event_lost; if (client->user_pversion >= SNDRV_PROTOCOL_VERSION(1, 0, 3)) client->midi_version = client_info->midi_version; memcpy(client->event_filter, client_info->event_filter, 32); client->group_filter = client_info->group_filter; return 0; } /* * CREATE PORT ioctl() */ static int snd_seq_ioctl_create_port(struct snd_seq_client *client, void *arg) { struct snd_seq_port_info *info = arg; struct snd_seq_client_port *port; struct snd_seq_port_callback *callback; int port_idx, err; /* it is not allowed to create the port for an another client */ if (info->addr.client != client->number) return -EPERM; if (client->type == USER_CLIENT && info->kernel) return -EINVAL; if ((info->capability & SNDRV_SEQ_PORT_CAP_UMP_ENDPOINT) && client->ump_endpoint_port >= 0) return -EBUSY; if (info->flags & SNDRV_SEQ_PORT_FLG_GIVEN_PORT) port_idx = info->addr.port; else port_idx = -1; if (port_idx >= SNDRV_SEQ_ADDRESS_UNKNOWN) return -EINVAL; err = snd_seq_create_port(client, port_idx, &port); if (err < 0) return err; if (client->type == KERNEL_CLIENT) { callback = info->kernel; if (callback) { if (callback->owner) port->owner = callback->owner; port->private_data = callback->private_data; port->private_free = callback->private_free; port->event_input = callback->event_input; port->c_src.open = callback->subscribe; port->c_src.close = callback->unsubscribe; port->c_dest.open = callback->use; port->c_dest.close = callback->unuse; } } info->addr = port->addr; snd_seq_set_port_info(port, info); if (info->capability & SNDRV_SEQ_PORT_CAP_UMP_ENDPOINT) client->ump_endpoint_port = port->addr.port; snd_seq_system_client_ev_port_start(port->addr.client, port->addr.port); snd_seq_port_unlock(port); return 0; } /* * DELETE PORT ioctl() */ static int snd_seq_ioctl_delete_port(struct snd_seq_client *client, void *arg) { struct snd_seq_port_info *info = arg; int err; /* it is not allowed to remove the port for an another client */ if (info->addr.client != client->number) return -EPERM; err = snd_seq_delete_port(client, info->addr.port); if (err >= 0) { if (client->ump_endpoint_port == info->addr.port) client->ump_endpoint_port = -1; snd_seq_system_client_ev_port_exit(client->number, info->addr.port); } return err; } /* * GET_PORT_INFO ioctl() (on any client) */ static int snd_seq_ioctl_get_port_info(struct snd_seq_client *client, void *arg) { struct snd_seq_port_info *info = arg; struct snd_seq_client *cptr; struct snd_seq_client_port *port; cptr = snd_seq_client_use_ptr(info->addr.client); if (cptr == NULL) return -ENXIO; port = snd_seq_port_use_ptr(cptr, info->addr.port); if (port == NULL) { snd_seq_client_unlock(cptr); return -ENOENT; /* don't change */ } /* get port info */ snd_seq_get_port_info(port, info); snd_seq_port_unlock(port); snd_seq_client_unlock(cptr); return 0; } /* * SET_PORT_INFO ioctl() (only ports on this/own client) */ static int snd_seq_ioctl_set_port_info(struct snd_seq_client *client, void *arg) { struct snd_seq_port_info *info = arg; struct snd_seq_client_port *port; if (info->addr.client != client->number) /* only set our own ports ! */ return -EPERM; port = snd_seq_port_use_ptr(client, info->addr.port); if (port) { snd_seq_set_port_info(port, info); snd_seq_port_unlock(port); } return 0; } /* * port subscription (connection) */ #define PERM_RD (SNDRV_SEQ_PORT_CAP_READ|SNDRV_SEQ_PORT_CAP_SUBS_READ) #define PERM_WR (SNDRV_SEQ_PORT_CAP_WRITE|SNDRV_SEQ_PORT_CAP_SUBS_WRITE) static int check_subscription_permission(struct snd_seq_client *client, struct snd_seq_client_port *sport, struct snd_seq_client_port *dport, struct snd_seq_port_subscribe *subs) { if (client->number != subs->sender.client && client->number != subs->dest.client) { /* connection by third client - check export permission */ if (check_port_perm(sport, SNDRV_SEQ_PORT_CAP_NO_EXPORT)) return -EPERM; if (check_port_perm(dport, SNDRV_SEQ_PORT_CAP_NO_EXPORT)) return -EPERM; } /* check read permission */ /* if sender or receiver is the subscribing client itself, * no permission check is necessary */ if (client->number != subs->sender.client) { if (! check_port_perm(sport, PERM_RD)) return -EPERM; } /* check write permission */ if (client->number != subs->dest.client) { if (! check_port_perm(dport, PERM_WR)) return -EPERM; } return 0; } /* * send an subscription notify event to user client: * client must be user client. */ int snd_seq_client_notify_subscription(int client, int port, struct snd_seq_port_subscribe *info, int evtype) { struct snd_seq_event event; memset(&event, 0, sizeof(event)); event.type = evtype; event.data.connect.dest = info->dest; event.data.connect.sender = info->sender; return snd_seq_system_notify(client, port, &event); /* non-atomic */ } /* * add to port's subscription list IOCTL interface */ static int snd_seq_ioctl_subscribe_port(struct snd_seq_client *client, void *arg) { struct snd_seq_port_subscribe *subs = arg; int result = -EINVAL; struct snd_seq_client *receiver = NULL, *sender = NULL; struct snd_seq_client_port *sport = NULL, *dport = NULL; receiver = snd_seq_client_use_ptr(subs->dest.client); if (!receiver) goto __end; sender = snd_seq_client_use_ptr(subs->sender.client); if (!sender) goto __end; sport = snd_seq_port_use_ptr(sender, subs->sender.port); if (!sport) goto __end; dport = snd_seq_port_use_ptr(receiver, subs->dest.port); if (!dport) goto __end; result = check_subscription_permission(client, sport, dport, subs); if (result < 0) goto __end; /* connect them */ result = snd_seq_port_connect(client, sender, sport, receiver, dport, subs); if (! result) /* broadcast announce */ snd_seq_client_notify_subscription(SNDRV_SEQ_ADDRESS_SUBSCRIBERS, 0, subs, SNDRV_SEQ_EVENT_PORT_SUBSCRIBED); __end: if (sport) snd_seq_port_unlock(sport); if (dport) snd_seq_port_unlock(dport); if (sender) snd_seq_client_unlock(sender); if (receiver) snd_seq_client_unlock(receiver); return result; } /* * remove from port's subscription list */ static int snd_seq_ioctl_unsubscribe_port(struct snd_seq_client *client, void *arg) { struct snd_seq_port_subscribe *subs = arg; int result = -ENXIO; struct snd_seq_client *receiver = NULL, *sender = NULL; struct snd_seq_client_port *sport = NULL, *dport = NULL; receiver = snd_seq_client_use_ptr(subs->dest.client); if (!receiver) goto __end; sender = snd_seq_client_use_ptr(subs->sender.client); if (!sender) goto __end; sport = snd_seq_port_use_ptr(sender, subs->sender.port); if (!sport) goto __end; dport = snd_seq_port_use_ptr(receiver, subs->dest.port); if (!dport) goto __end; result = check_subscription_permission(client, sport, dport, subs); if (result < 0) goto __end; result = snd_seq_port_disconnect(client, sender, sport, receiver, dport, subs); if (! result) /* broadcast announce */ snd_seq_client_notify_subscription(SNDRV_SEQ_ADDRESS_SUBSCRIBERS, 0, subs, SNDRV_SEQ_EVENT_PORT_UNSUBSCRIBED); __end: if (sport) snd_seq_port_unlock(sport); if (dport) snd_seq_port_unlock(dport); if (sender) snd_seq_client_unlock(sender); if (receiver) snd_seq_client_unlock(receiver); return result; } /* CREATE_QUEUE ioctl() */ static int snd_seq_ioctl_create_queue(struct snd_seq_client *client, void *arg) { struct snd_seq_queue_info *info = arg; struct snd_seq_queue *q; q = snd_seq_queue_alloc(client->number, info->locked, info->flags); if (IS_ERR(q)) return PTR_ERR(q); info->queue = q->queue; info->locked = q->locked; info->owner = q->owner; /* set queue name */ if (!info->name[0]) snprintf(info->name, sizeof(info->name), "Queue-%d", q->queue); strscpy(q->name, info->name, sizeof(q->name)); snd_use_lock_free(&q->use_lock); return 0; } /* DELETE_QUEUE ioctl() */ static int snd_seq_ioctl_delete_queue(struct snd_seq_client *client, void *arg) { struct snd_seq_queue_info *info = arg; return snd_seq_queue_delete(client->number, info->queue); } /* GET_QUEUE_INFO ioctl() */ static int snd_seq_ioctl_get_queue_info(struct snd_seq_client *client, void *arg) { struct snd_seq_queue_info *info = arg; struct snd_seq_queue *q; q = queueptr(info->queue); if (q == NULL) return -EINVAL; memset(info, 0, sizeof(*info)); info->queue = q->queue; info->owner = q->owner; info->locked = q->locked; strscpy(info->name, q->name, sizeof(info->name)); queuefree(q); return 0; } /* SET_QUEUE_INFO ioctl() */ static int snd_seq_ioctl_set_queue_info(struct snd_seq_client *client, void *arg) { struct snd_seq_queue_info *info = arg; struct snd_seq_queue *q; if (info->owner != client->number) return -EINVAL; /* change owner/locked permission */ if (snd_seq_queue_check_access(info->queue, client->number)) { if (snd_seq_queue_set_owner(info->queue, client->number, info->locked) < 0) return -EPERM; if (info->locked) snd_seq_queue_use(info->queue, client->number, 1); } else { return -EPERM; } q = queueptr(info->queue); if (! q) return -EINVAL; if (q->owner != client->number) { queuefree(q); return -EPERM; } strscpy(q->name, info->name, sizeof(q->name)); queuefree(q); return 0; } /* GET_NAMED_QUEUE ioctl() */ static int snd_seq_ioctl_get_named_queue(struct snd_seq_client *client, void *arg) { struct snd_seq_queue_info *info = arg; struct snd_seq_queue *q; q = snd_seq_queue_find_name(info->name); if (q == NULL) return -EINVAL; info->queue = q->queue; info->owner = q->owner; info->locked = q->locked; queuefree(q); return 0; } /* GET_QUEUE_STATUS ioctl() */ static int snd_seq_ioctl_get_queue_status(struct snd_seq_client *client, void *arg) { struct snd_seq_queue_status *status = arg; struct snd_seq_queue *queue; struct snd_seq_timer *tmr; queue = queueptr(status->queue); if (queue == NULL) return -EINVAL; memset(status, 0, sizeof(*status)); status->queue = queue->queue; tmr = queue->timer; status->events = queue->tickq->cells + queue->timeq->cells; status->time = snd_seq_timer_get_cur_time(tmr, true); status->tick = snd_seq_timer_get_cur_tick(tmr); status->running = tmr->running; status->flags = queue->flags; queuefree(queue); return 0; } /* GET_QUEUE_TEMPO ioctl() */ static int snd_seq_ioctl_get_queue_tempo(struct snd_seq_client *client, void *arg) { struct snd_seq_queue_tempo *tempo = arg; struct snd_seq_queue *queue; struct snd_seq_timer *tmr; queue = queueptr(tempo->queue); if (queue == NULL) return -EINVAL; memset(tempo, 0, sizeof(*tempo)); tempo->queue = queue->queue; tmr = queue->timer; tempo->tempo = tmr->tempo; tempo->ppq = tmr->ppq; tempo->skew_value = tmr->skew; tempo->skew_base = tmr->skew_base; queuefree(queue); return 0; } /* SET_QUEUE_TEMPO ioctl() */ int snd_seq_set_queue_tempo(int client, struct snd_seq_queue_tempo *tempo) { if (!snd_seq_queue_check_access(tempo->queue, client)) return -EPERM; return snd_seq_queue_timer_set_tempo(tempo->queue, client, tempo); } EXPORT_SYMBOL(snd_seq_set_queue_tempo); static int snd_seq_ioctl_set_queue_tempo(struct snd_seq_client *client, void *arg) { struct snd_seq_queue_tempo *tempo = arg; int result; result = snd_seq_set_queue_tempo(client->number, tempo); return result < 0 ? result : 0; } /* GET_QUEUE_TIMER ioctl() */ static int snd_seq_ioctl_get_queue_timer(struct snd_seq_client *client, void *arg) { struct snd_seq_queue_timer *timer = arg; struct snd_seq_queue *queue; struct snd_seq_timer *tmr; queue = queueptr(timer->queue); if (queue == NULL) return -EINVAL; mutex_lock(&queue->timer_mutex); tmr = queue->timer; memset(timer, 0, sizeof(*timer)); timer->queue = queue->queue; timer->type = tmr->type; if (tmr->type == SNDRV_SEQ_TIMER_ALSA) { timer->u.alsa.id = tmr->alsa_id; timer->u.alsa.resolution = tmr->preferred_resolution; } mutex_unlock(&queue->timer_mutex); queuefree(queue); return 0; } /* SET_QUEUE_TIMER ioctl() */ static int snd_seq_ioctl_set_queue_timer(struct snd_seq_client *client, void *arg) { struct snd_seq_queue_timer *timer = arg; int result = 0; if (timer->type != SNDRV_SEQ_TIMER_ALSA) return -EINVAL; if (snd_seq_queue_check_access(timer->queue, client->number)) { struct snd_seq_queue *q; struct snd_seq_timer *tmr; q = queueptr(timer->queue); if (q == NULL) return -ENXIO; mutex_lock(&q->timer_mutex); tmr = q->timer; snd_seq_queue_timer_close(timer->queue); tmr->type = timer->type; if (tmr->type == SNDRV_SEQ_TIMER_ALSA) { tmr->alsa_id = timer->u.alsa.id; tmr->preferred_resolution = timer->u.alsa.resolution; } result = snd_seq_queue_timer_open(timer->queue); mutex_unlock(&q->timer_mutex); queuefree(q); } else { return -EPERM; } return result; } /* GET_QUEUE_CLIENT ioctl() */ static int snd_seq_ioctl_get_queue_client(struct snd_seq_client *client, void *arg) { struct snd_seq_queue_client *info = arg; int used; used = snd_seq_queue_is_used(info->queue, client->number); if (used < 0) return -EINVAL; info->used = used; info->client = client->number; return 0; } /* SET_QUEUE_CLIENT ioctl() */ static int snd_seq_ioctl_set_queue_client(struct snd_seq_client *client, void *arg) { struct snd_seq_queue_client *info = arg; int err; if (info->used >= 0) { err = snd_seq_queue_use(info->queue, client->number, info->used); if (err < 0) return err; } return snd_seq_ioctl_get_queue_client(client, arg); } /* GET_CLIENT_POOL ioctl() */ static int snd_seq_ioctl_get_client_pool(struct snd_seq_client *client, void *arg) { struct snd_seq_client_pool *info = arg; struct snd_seq_client *cptr; cptr = snd_seq_client_use_ptr(info->client); if (cptr == NULL) return -ENOENT; memset(info, 0, sizeof(*info)); info->client = cptr->number; info->output_pool = cptr->pool->size; info->output_room = cptr->pool->room; info->output_free = info->output_pool; info->output_free = snd_seq_unused_cells(cptr->pool); if (cptr->type == USER_CLIENT) { info->input_pool = cptr->data.user.fifo_pool_size; info->input_free = info->input_pool; info->input_free = snd_seq_fifo_unused_cells(cptr->data.user.fifo); } else { info->input_pool = 0; info->input_free = 0; } snd_seq_client_unlock(cptr); return 0; } /* SET_CLIENT_POOL ioctl() */ static int snd_seq_ioctl_set_client_pool(struct snd_seq_client *client, void *arg) { struct snd_seq_client_pool *info = arg; int rc; if (client->number != info->client) return -EINVAL; /* can't change other clients */ if (info->output_pool >= 1 && info->output_pool <= SNDRV_SEQ_MAX_EVENTS && (! snd_seq_write_pool_allocated(client) || info->output_pool != client->pool->size)) { if (snd_seq_write_pool_allocated(client)) { /* is the pool in use? */ if (atomic_read(&client->pool->counter)) return -EBUSY; /* remove all existing cells */ snd_seq_pool_mark_closing(client->pool); snd_seq_pool_done(client->pool); } client->pool->size = info->output_pool; rc = snd_seq_pool_init(client->pool); if (rc < 0) return rc; } if (client->type == USER_CLIENT && client->data.user.fifo != NULL && info->input_pool >= 1 && info->input_pool <= SNDRV_SEQ_MAX_CLIENT_EVENTS && info->input_pool != client->data.user.fifo_pool_size) { /* change pool size */ rc = snd_seq_fifo_resize(client->data.user.fifo, info->input_pool); if (rc < 0) return rc; client->data.user.fifo_pool_size = info->input_pool; } if (info->output_room >= 1 && info->output_room <= client->pool->size) { client->pool->room = info->output_room; } return snd_seq_ioctl_get_client_pool(client, arg); } /* REMOVE_EVENTS ioctl() */ static int snd_seq_ioctl_remove_events(struct snd_seq_client *client, void *arg) { struct snd_seq_remove_events *info = arg; /* * Input mostly not implemented XXX. */ if (info->remove_mode & SNDRV_SEQ_REMOVE_INPUT) { /* * No restrictions so for a user client we can clear * the whole fifo */ if (client->type == USER_CLIENT && client->data.user.fifo) snd_seq_fifo_clear(client->data.user.fifo); } if (info->remove_mode & SNDRV_SEQ_REMOVE_OUTPUT) snd_seq_queue_remove_cells(client->number, info); return 0; } /* * get subscription info */ static int snd_seq_ioctl_get_subscription(struct snd_seq_client *client, void *arg) { struct snd_seq_port_subscribe *subs = arg; int result; struct snd_seq_client *sender = NULL; struct snd_seq_client_port *sport = NULL; result = -EINVAL; sender = snd_seq_client_use_ptr(subs->sender.client); if (!sender) goto __end; sport = snd_seq_port_use_ptr(sender, subs->sender.port); if (!sport) goto __end; result = snd_seq_port_get_subscription(&sport->c_src, &subs->dest, subs); __end: if (sport) snd_seq_port_unlock(sport); if (sender) snd_seq_client_unlock(sender); return result; } /* * get subscription info - check only its presence */ static int snd_seq_ioctl_query_subs(struct snd_seq_client *client, void *arg) { struct snd_seq_query_subs *subs = arg; int result = -ENXIO; struct snd_seq_client *cptr = NULL; struct snd_seq_client_port *port = NULL; struct snd_seq_port_subs_info *group; struct list_head *p; int i; cptr = snd_seq_client_use_ptr(subs->root.client); if (!cptr) goto __end; port = snd_seq_port_use_ptr(cptr, subs->root.port); if (!port) goto __end; switch (subs->type) { case SNDRV_SEQ_QUERY_SUBS_READ: group = &port->c_src; break; case SNDRV_SEQ_QUERY_SUBS_WRITE: group = &port->c_dest; break; default: goto __end; } down_read(&group->list_mutex); /* search for the subscriber */ subs->num_subs = group->count; i = 0; result = -ENOENT; list_for_each(p, &group->list_head) { if (i++ == subs->index) { /* found! */ struct snd_seq_subscribers *s; if (subs->type == SNDRV_SEQ_QUERY_SUBS_READ) { s = list_entry(p, struct snd_seq_subscribers, src_list); subs->addr = s->info.dest; } else { s = list_entry(p, struct snd_seq_subscribers, dest_list); subs->addr = s->info.sender; } subs->flags = s->info.flags; subs->queue = s->info.queue; result = 0; break; } } up_read(&group->list_mutex); __end: if (port) snd_seq_port_unlock(port); if (cptr) snd_seq_client_unlock(cptr); return result; } /* * query next client */ static int snd_seq_ioctl_query_next_client(struct snd_seq_client *client, void *arg) { struct snd_seq_client_info *info = arg; struct snd_seq_client *cptr = NULL; /* search for next client */ if (info->client < INT_MAX) info->client++; if (info->client < 0) info->client = 0; for (; info->client < SNDRV_SEQ_MAX_CLIENTS; info->client++) { cptr = snd_seq_client_use_ptr(info->client); if (cptr) break; /* found */ } if (cptr == NULL) return -ENOENT; get_client_info(cptr, info); snd_seq_client_unlock(cptr); return 0; } /* * query next port */ static int snd_seq_ioctl_query_next_port(struct snd_seq_client *client, void *arg) { struct snd_seq_port_info *info = arg; struct snd_seq_client *cptr; struct snd_seq_client_port *port = NULL; cptr = snd_seq_client_use_ptr(info->addr.client); if (cptr == NULL) return -ENXIO; /* search for next port */ info->addr.port++; port = snd_seq_port_query_nearest(cptr, info); if (port == NULL) { snd_seq_client_unlock(cptr); return -ENOENT; } /* get port info */ info->addr = port->addr; snd_seq_get_port_info(port, info); snd_seq_port_unlock(port); snd_seq_client_unlock(cptr); return 0; } #if IS_ENABLED(CONFIG_SND_SEQ_UMP) #define NUM_UMP_INFOS (SNDRV_UMP_MAX_BLOCKS + 1) static void free_ump_info(struct snd_seq_client *client) { int i; if (!client->ump_info) return; for (i = 0; i < NUM_UMP_INFOS; i++) kfree(client->ump_info[i]); kfree(client->ump_info); client->ump_info = NULL; } static void terminate_ump_info_strings(void *p, int type) { if (type == SNDRV_SEQ_CLIENT_UMP_INFO_ENDPOINT) { struct snd_ump_endpoint_info *ep = p; ep->name[sizeof(ep->name) - 1] = 0; } else { struct snd_ump_block_info *bp = p; bp->name[sizeof(bp->name) - 1] = 0; } } #ifdef CONFIG_SND_PROC_FS static void dump_ump_info(struct snd_info_buffer *buffer, struct snd_seq_client *client) { struct snd_ump_endpoint_info *ep; struct snd_ump_block_info *bp; int i; if (!client->ump_info) return; ep = client->ump_info[SNDRV_SEQ_CLIENT_UMP_INFO_ENDPOINT]; if (ep && *ep->name) snd_iprintf(buffer, " UMP Endpoint: \"%s\"\n", ep->name); for (i = 0; i < SNDRV_UMP_MAX_BLOCKS; i++) { bp = client->ump_info[i + 1]; if (bp && *bp->name) { snd_iprintf(buffer, " UMP Block %d: \"%s\" [%s]\n", i, bp->name, bp->active ? "Active" : "Inactive"); snd_iprintf(buffer, " Groups: %d-%d\n", bp->first_group + 1, bp->first_group + bp->num_groups); } } } #endif /* UMP-specific ioctls -- called directly without data copy */ static int snd_seq_ioctl_client_ump_info(struct snd_seq_client *caller, unsigned int cmd, unsigned long arg) { struct snd_seq_client_ump_info __user *argp = (struct snd_seq_client_ump_info __user *)arg; struct snd_seq_client *cptr; int client, type, err = 0; size_t size; void *p; if (get_user(client, &argp->client) || get_user(type, &argp->type)) return -EFAULT; if (cmd == SNDRV_SEQ_IOCTL_SET_CLIENT_UMP_INFO && caller->number != client) return -EPERM; if (type < 0 || type >= NUM_UMP_INFOS) return -EINVAL; if (type == SNDRV_SEQ_CLIENT_UMP_INFO_ENDPOINT) size = sizeof(struct snd_ump_endpoint_info); else size = sizeof(struct snd_ump_block_info); cptr = snd_seq_client_use_ptr(client); if (!cptr) return -ENOENT; mutex_lock(&cptr->ioctl_mutex); if (!cptr->midi_version) { err = -EBADFD; goto error; } if (cmd == SNDRV_SEQ_IOCTL_GET_CLIENT_UMP_INFO) { if (!cptr->ump_info) p = NULL; else p = cptr->ump_info[type]; if (!p) { err = -ENODEV; goto error; } if (copy_to_user(argp->info, p, size)) { err = -EFAULT; goto error; } } else { if (cptr->type != USER_CLIENT) { err = -EBADFD; goto error; } if (!cptr->ump_info) { cptr->ump_info = kcalloc(NUM_UMP_INFOS, sizeof(void *), GFP_KERNEL); if (!cptr->ump_info) { err = -ENOMEM; goto error; } } p = memdup_user(argp->info, size); if (IS_ERR(p)) { err = PTR_ERR(p); goto error; } kfree(cptr->ump_info[type]); terminate_ump_info_strings(p, type); cptr->ump_info[type] = p; } error: mutex_unlock(&cptr->ioctl_mutex); snd_seq_client_unlock(cptr); return err; } #endif /* -------------------------------------------------------- */ static const struct ioctl_handler { unsigned int cmd; int (*func)(struct snd_seq_client *client, void *arg); } ioctl_handlers[] = { { SNDRV_SEQ_IOCTL_PVERSION, snd_seq_ioctl_pversion }, { SNDRV_SEQ_IOCTL_USER_PVERSION, snd_seq_ioctl_user_pversion }, { SNDRV_SEQ_IOCTL_CLIENT_ID, snd_seq_ioctl_client_id }, { SNDRV_SEQ_IOCTL_SYSTEM_INFO, snd_seq_ioctl_system_info }, { SNDRV_SEQ_IOCTL_RUNNING_MODE, snd_seq_ioctl_running_mode }, { SNDRV_SEQ_IOCTL_GET_CLIENT_INFO, snd_seq_ioctl_get_client_info }, { SNDRV_SEQ_IOCTL_SET_CLIENT_INFO, snd_seq_ioctl_set_client_info }, { SNDRV_SEQ_IOCTL_CREATE_PORT, snd_seq_ioctl_create_port }, { SNDRV_SEQ_IOCTL_DELETE_PORT, snd_seq_ioctl_delete_port }, { SNDRV_SEQ_IOCTL_GET_PORT_INFO, snd_seq_ioctl_get_port_info }, { SNDRV_SEQ_IOCTL_SET_PORT_INFO, snd_seq_ioctl_set_port_info }, { SNDRV_SEQ_IOCTL_SUBSCRIBE_PORT, snd_seq_ioctl_subscribe_port }, { SNDRV_SEQ_IOCTL_UNSUBSCRIBE_PORT, snd_seq_ioctl_unsubscribe_port }, { SNDRV_SEQ_IOCTL_CREATE_QUEUE, snd_seq_ioctl_create_queue }, { SNDRV_SEQ_IOCTL_DELETE_QUEUE, snd_seq_ioctl_delete_queue }, { SNDRV_SEQ_IOCTL_GET_QUEUE_INFO, snd_seq_ioctl_get_queue_info }, { SNDRV_SEQ_IOCTL_SET_QUEUE_INFO, snd_seq_ioctl_set_queue_info }, { SNDRV_SEQ_IOCTL_GET_NAMED_QUEUE, snd_seq_ioctl_get_named_queue }, { SNDRV_SEQ_IOCTL_GET_QUEUE_STATUS, snd_seq_ioctl_get_queue_status }, { SNDRV_SEQ_IOCTL_GET_QUEUE_TEMPO, snd_seq_ioctl_get_queue_tempo }, { SNDRV_SEQ_IOCTL_SET_QUEUE_TEMPO, snd_seq_ioctl_set_queue_tempo }, { SNDRV_SEQ_IOCTL_GET_QUEUE_TIMER, snd_seq_ioctl_get_queue_timer }, { SNDRV_SEQ_IOCTL_SET_QUEUE_TIMER, snd_seq_ioctl_set_queue_timer }, { SNDRV_SEQ_IOCTL_GET_QUEUE_CLIENT, snd_seq_ioctl_get_queue_client }, { SNDRV_SEQ_IOCTL_SET_QUEUE_CLIENT, snd_seq_ioctl_set_queue_client }, { SNDRV_SEQ_IOCTL_GET_CLIENT_POOL, snd_seq_ioctl_get_client_pool }, { SNDRV_SEQ_IOCTL_SET_CLIENT_POOL, snd_seq_ioctl_set_client_pool }, { SNDRV_SEQ_IOCTL_GET_SUBSCRIPTION, snd_seq_ioctl_get_subscription }, { SNDRV_SEQ_IOCTL_QUERY_NEXT_CLIENT, snd_seq_ioctl_query_next_client }, { SNDRV_SEQ_IOCTL_QUERY_NEXT_PORT, snd_seq_ioctl_query_next_port }, { SNDRV_SEQ_IOCTL_REMOVE_EVENTS, snd_seq_ioctl_remove_events }, { SNDRV_SEQ_IOCTL_QUERY_SUBS, snd_seq_ioctl_query_subs }, { 0, NULL }, }; static long snd_seq_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct snd_seq_client *client = file->private_data; /* To use kernel stack for ioctl data. */ union { int pversion; int client_id; struct snd_seq_system_info system_info; struct snd_seq_running_info running_info; struct snd_seq_client_info client_info; struct snd_seq_port_info port_info; struct snd_seq_port_subscribe port_subscribe; struct snd_seq_queue_info queue_info; struct snd_seq_queue_status queue_status; struct snd_seq_queue_tempo tempo; struct snd_seq_queue_timer queue_timer; struct snd_seq_queue_client queue_client; struct snd_seq_client_pool client_pool; struct snd_seq_remove_events remove_events; struct snd_seq_query_subs query_subs; } buf; const struct ioctl_handler *handler; unsigned long size; int err; if (snd_BUG_ON(!client)) return -ENXIO; #if IS_ENABLED(CONFIG_SND_SEQ_UMP) /* exception - handling large data */ switch (cmd) { case SNDRV_SEQ_IOCTL_GET_CLIENT_UMP_INFO: case SNDRV_SEQ_IOCTL_SET_CLIENT_UMP_INFO: return snd_seq_ioctl_client_ump_info(client, cmd, arg); } #endif for (handler = ioctl_handlers; handler->cmd > 0; ++handler) { if (handler->cmd == cmd) break; } if (handler->cmd == 0) return -ENOTTY; memset(&buf, 0, sizeof(buf)); /* * All of ioctl commands for ALSA sequencer get an argument of size * within 13 bits. We can safely pick up the size from the command. */ size = _IOC_SIZE(handler->cmd); if (handler->cmd & IOC_IN) { if (copy_from_user(&buf, (const void __user *)arg, size)) return -EFAULT; } mutex_lock(&client->ioctl_mutex); err = handler->func(client, &buf); mutex_unlock(&client->ioctl_mutex); if (err >= 0) { /* Some commands includes a bug in 'dir' field. */ if (handler->cmd == SNDRV_SEQ_IOCTL_SET_QUEUE_CLIENT || handler->cmd == SNDRV_SEQ_IOCTL_SET_CLIENT_POOL || (handler->cmd & IOC_OUT)) if (copy_to_user((void __user *)arg, &buf, size)) return -EFAULT; } return err; } #ifdef CONFIG_COMPAT #include "seq_compat.c" #else #define snd_seq_ioctl_compat NULL #endif /* -------------------------------------------------------- */ /* exported to kernel modules */ int snd_seq_create_kernel_client(struct snd_card *card, int client_index, const char *name_fmt, ...) { struct snd_seq_client *client; va_list args; if (snd_BUG_ON(in_interrupt())) return -EBUSY; if (card && client_index >= SNDRV_SEQ_CLIENTS_PER_CARD) return -EINVAL; if (card == NULL && client_index >= SNDRV_SEQ_GLOBAL_CLIENTS) return -EINVAL; mutex_lock(&register_mutex); if (card) { client_index += SNDRV_SEQ_GLOBAL_CLIENTS + card->number * SNDRV_SEQ_CLIENTS_PER_CARD; if (client_index >= SNDRV_SEQ_DYNAMIC_CLIENTS_BEGIN) client_index = -1; } /* empty write queue as default */ client = seq_create_client1(client_index, 0); if (client == NULL) { mutex_unlock(&register_mutex); return -EBUSY; /* failure code */ } usage_alloc(&client_usage, 1); client->accept_input = 1; client->accept_output = 1; client->data.kernel.card = card; client->user_pversion = SNDRV_SEQ_VERSION; va_start(args, name_fmt); vsnprintf(client->name, sizeof(client->name), name_fmt, args); va_end(args); client->type = KERNEL_CLIENT; mutex_unlock(&register_mutex); /* make others aware this new client */ snd_seq_system_client_ev_client_start(client->number); /* return client number to caller */ return client->number; } EXPORT_SYMBOL(snd_seq_create_kernel_client); /* exported to kernel modules */ int snd_seq_delete_kernel_client(int client) { struct snd_seq_client *ptr; if (snd_BUG_ON(in_interrupt())) return -EBUSY; ptr = clientptr(client); if (ptr == NULL) return -EINVAL; seq_free_client(ptr); kfree(ptr); return 0; } EXPORT_SYMBOL(snd_seq_delete_kernel_client); /* * exported, called by kernel clients to enqueue events (w/o blocking) * * RETURN VALUE: zero if succeed, negative if error */ int snd_seq_kernel_client_enqueue(int client, struct snd_seq_event *ev, struct file *file, bool blocking) { struct snd_seq_client *cptr; int result; if (snd_BUG_ON(!ev)) return -EINVAL; if (!snd_seq_ev_is_ump(ev)) { if (ev->type == SNDRV_SEQ_EVENT_NONE) return 0; /* ignore this */ if (ev->type == SNDRV_SEQ_EVENT_KERNEL_ERROR) return -EINVAL; /* quoted events can't be enqueued */ } /* fill in client number */ ev->source.client = client; if (check_event_type_and_length(ev)) return -EINVAL; cptr = snd_seq_client_use_ptr(client); if (cptr == NULL) return -EINVAL; if (!cptr->accept_output) { result = -EPERM; } else { /* send it */ mutex_lock(&cptr->ioctl_mutex); result = snd_seq_client_enqueue_event(cptr, ev, file, blocking, false, 0, &cptr->ioctl_mutex); mutex_unlock(&cptr->ioctl_mutex); } snd_seq_client_unlock(cptr); return result; } EXPORT_SYMBOL(snd_seq_kernel_client_enqueue); /* * exported, called by kernel clients to dispatch events directly to other * clients, bypassing the queues. Event time-stamp will be updated. * * RETURN VALUE: negative = delivery failed, * zero, or positive: the number of delivered events */ int snd_seq_kernel_client_dispatch(int client, struct snd_seq_event * ev, int atomic, int hop) { struct snd_seq_client *cptr; int result; if (snd_BUG_ON(!ev)) return -EINVAL; /* fill in client number */ ev->queue = SNDRV_SEQ_QUEUE_DIRECT; ev->source.client = client; if (check_event_type_and_length(ev)) return -EINVAL; cptr = snd_seq_client_use_ptr(client); if (cptr == NULL) return -EINVAL; if (!cptr->accept_output) result = -EPERM; else result = snd_seq_deliver_event(cptr, ev, atomic, hop); snd_seq_client_unlock(cptr); return result; } EXPORT_SYMBOL(snd_seq_kernel_client_dispatch); /** * snd_seq_kernel_client_ctl - operate a command for a client with data in * kernel space. * @clientid: A numerical ID for a client. * @cmd: An ioctl(2) command for ALSA sequencer operation. * @arg: A pointer to data in kernel space. * * Against its name, both kernel/application client can be handled by this * kernel API. A pointer of 'arg' argument should be in kernel space. * * Return: 0 at success. Negative error code at failure. */ int snd_seq_kernel_client_ctl(int clientid, unsigned int cmd, void *arg) { const struct ioctl_handler *handler; struct snd_seq_client *client; client = clientptr(clientid); if (client == NULL) return -ENXIO; for (handler = ioctl_handlers; handler->cmd > 0; ++handler) { if (handler->cmd == cmd) return handler->func(client, arg); } pr_debug("ALSA: seq unknown ioctl() 0x%x (type='%c', number=0x%02x)\n", cmd, _IOC_TYPE(cmd), _IOC_NR(cmd)); return -ENOTTY; } EXPORT_SYMBOL(snd_seq_kernel_client_ctl); /* exported (for OSS emulator) */ int snd_seq_kernel_client_write_poll(int clientid, struct file *file, poll_table *wait) { struct snd_seq_client *client; client = clientptr(clientid); if (client == NULL) return -ENXIO; if (! snd_seq_write_pool_allocated(client)) return 1; if (snd_seq_pool_poll_wait(client->pool, file, wait)) return 1; return 0; } EXPORT_SYMBOL(snd_seq_kernel_client_write_poll); /* get a sequencer client object; for internal use from a kernel client */ struct snd_seq_client *snd_seq_kernel_client_get(int id) { return snd_seq_client_use_ptr(id); } EXPORT_SYMBOL_GPL(snd_seq_kernel_client_get); /* put a sequencer client object; for internal use from a kernel client */ void snd_seq_kernel_client_put(struct snd_seq_client *cptr) { if (cptr) snd_seq_client_unlock(cptr); } EXPORT_SYMBOL_GPL(snd_seq_kernel_client_put); /*---------------------------------------------------------------------------*/ #ifdef CONFIG_SND_PROC_FS /* * /proc interface */ static void snd_seq_info_dump_subscribers(struct snd_info_buffer *buffer, struct snd_seq_port_subs_info *group, int is_src, char *msg) { struct list_head *p; struct snd_seq_subscribers *s; int count = 0; down_read(&group->list_mutex); if (list_empty(&group->list_head)) { up_read(&group->list_mutex); return; } snd_iprintf(buffer, msg); list_for_each(p, &group->list_head) { if (is_src) s = list_entry(p, struct snd_seq_subscribers, src_list); else s = list_entry(p, struct snd_seq_subscribers, dest_list); if (count++) snd_iprintf(buffer, ", "); snd_iprintf(buffer, "%d:%d", is_src ? s->info.dest.client : s->info.sender.client, is_src ? s->info.dest.port : s->info.sender.port); if (s->info.flags & SNDRV_SEQ_PORT_SUBS_TIMESTAMP) snd_iprintf(buffer, "[%c:%d]", ((s->info.flags & SNDRV_SEQ_PORT_SUBS_TIME_REAL) ? 'r' : 't'), s->info.queue); if (group->exclusive) snd_iprintf(buffer, "[ex]"); } up_read(&group->list_mutex); snd_iprintf(buffer, "\n"); } #define FLAG_PERM_RD(perm) ((perm) & SNDRV_SEQ_PORT_CAP_READ ? ((perm) & SNDRV_SEQ_PORT_CAP_SUBS_READ ? 'R' : 'r') : '-') #define FLAG_PERM_WR(perm) ((perm) & SNDRV_SEQ_PORT_CAP_WRITE ? ((perm) & SNDRV_SEQ_PORT_CAP_SUBS_WRITE ? 'W' : 'w') : '-') #define FLAG_PERM_EX(perm) ((perm) & SNDRV_SEQ_PORT_CAP_NO_EXPORT ? '-' : 'e') #define FLAG_PERM_DUPLEX(perm) ((perm) & SNDRV_SEQ_PORT_CAP_DUPLEX ? 'X' : '-') static const char *port_direction_name(unsigned char dir) { static const char *names[4] = { "-", "In", "Out", "In/Out" }; if (dir > SNDRV_SEQ_PORT_DIR_BIDIRECTION) return "Invalid"; return names[dir]; } static void snd_seq_info_dump_ports(struct snd_info_buffer *buffer, struct snd_seq_client *client) { struct snd_seq_client_port *p; mutex_lock(&client->ports_mutex); list_for_each_entry(p, &client->ports_list_head, list) { if (p->capability & SNDRV_SEQ_PORT_CAP_INACTIVE) continue; snd_iprintf(buffer, " Port %3d : \"%s\" (%c%c%c%c) [%s]\n", p->addr.port, p->name, FLAG_PERM_RD(p->capability), FLAG_PERM_WR(p->capability), FLAG_PERM_EX(p->capability), FLAG_PERM_DUPLEX(p->capability), port_direction_name(p->direction)); snd_seq_info_dump_subscribers(buffer, &p->c_src, 1, " Connecting To: "); snd_seq_info_dump_subscribers(buffer, &p->c_dest, 0, " Connected From: "); } mutex_unlock(&client->ports_mutex); } static const char *midi_version_string(unsigned int version) { switch (version) { case SNDRV_SEQ_CLIENT_LEGACY_MIDI: return "Legacy"; case SNDRV_SEQ_CLIENT_UMP_MIDI_1_0: return "UMP MIDI1"; case SNDRV_SEQ_CLIENT_UMP_MIDI_2_0: return "UMP MIDI2"; default: return "Unknown"; } } /* exported to seq_info.c */ void snd_seq_info_clients_read(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { int c; struct snd_seq_client *client; snd_iprintf(buffer, "Client info\n"); snd_iprintf(buffer, " cur clients : %d\n", client_usage.cur); snd_iprintf(buffer, " peak clients : %d\n", client_usage.peak); snd_iprintf(buffer, " max clients : %d\n", SNDRV_SEQ_MAX_CLIENTS); snd_iprintf(buffer, "\n"); /* list the client table */ for (c = 0; c < SNDRV_SEQ_MAX_CLIENTS; c++) { client = snd_seq_client_use_ptr(c); if (client == NULL) continue; if (client->type == NO_CLIENT) { snd_seq_client_unlock(client); continue; } snd_iprintf(buffer, "Client %3d : \"%s\" [%s %s]\n", c, client->name, client->type == USER_CLIENT ? "User" : "Kernel", midi_version_string(client->midi_version)); #if IS_ENABLED(CONFIG_SND_SEQ_UMP) dump_ump_info(buffer, client); #endif snd_seq_info_dump_ports(buffer, client); if (snd_seq_write_pool_allocated(client)) { snd_iprintf(buffer, " Output pool :\n"); snd_seq_info_pool(buffer, client->pool, " "); } if (client->type == USER_CLIENT && client->data.user.fifo && client->data.user.fifo->pool) { snd_iprintf(buffer, " Input pool :\n"); snd_seq_info_pool(buffer, client->data.user.fifo->pool, " "); } snd_seq_client_unlock(client); } } #endif /* CONFIG_SND_PROC_FS */ /*---------------------------------------------------------------------------*/ /* * REGISTRATION PART */ static const struct file_operations snd_seq_f_ops = { .owner = THIS_MODULE, .read = snd_seq_read, .write = snd_seq_write, .open = snd_seq_open, .release = snd_seq_release, .llseek = no_llseek, .poll = snd_seq_poll, .unlocked_ioctl = snd_seq_ioctl, .compat_ioctl = snd_seq_ioctl_compat, }; static struct device *seq_dev; /* * register sequencer device */ int __init snd_sequencer_device_init(void) { int err; err = snd_device_alloc(&seq_dev, NULL); if (err < 0) return err; dev_set_name(seq_dev, "seq"); mutex_lock(&register_mutex); err = snd_register_device(SNDRV_DEVICE_TYPE_SEQUENCER, NULL, 0, &snd_seq_f_ops, NULL, seq_dev); mutex_unlock(&register_mutex); if (err < 0) { put_device(seq_dev); return err; } return 0; } /* * unregister sequencer device */ void snd_sequencer_device_done(void) { snd_unregister_device(seq_dev); put_device(seq_dev); }
5 5 4 4 4 6 6 4 2 2 2 4 4 2 2 2 2 2 10 10 10 10 1 10 10 1 10 6 6 6 4 4 2 2 10 4 1 8 8 2 2 2 2 6 6 4 4 2 2 1 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2010 Patrick McHardy <kaber@trash.net> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/gfp.h> #include <linux/skbuff.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_CT.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_conntrack_zones.h> static inline int xt_ct_target(struct sk_buff *skb, struct nf_conn *ct) { /* Previously seen (loopback)? Ignore. */ if (skb->_nfct != 0) return XT_CONTINUE; if (ct) { refcount_inc(&ct->ct_general.use); nf_ct_set(skb, ct, IP_CT_NEW); } else { nf_ct_set(skb, ct, IP_CT_UNTRACKED); } return XT_CONTINUE; } static unsigned int xt_ct_target_v0(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_ct_target_info *info = par->targinfo; struct nf_conn *ct = info->ct; return xt_ct_target(skb, ct); } static unsigned int xt_ct_target_v1(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_ct_target_info_v1 *info = par->targinfo; struct nf_conn *ct = info->ct; return xt_ct_target(skb, ct); } static u8 xt_ct_find_proto(const struct xt_tgchk_param *par) { if (par->family == NFPROTO_IPV4) { const struct ipt_entry *e = par->entryinfo; if (e->ip.invflags & IPT_INV_PROTO) return 0; return e->ip.proto; } else if (par->family == NFPROTO_IPV6) { const struct ip6t_entry *e = par->entryinfo; if (e->ipv6.invflags & IP6T_INV_PROTO) return 0; return e->ipv6.proto; } else return 0; } static int xt_ct_set_helper(struct nf_conn *ct, const char *helper_name, const struct xt_tgchk_param *par) { struct nf_conntrack_helper *helper; struct nf_conn_help *help; u8 proto; proto = xt_ct_find_proto(par); if (!proto) { pr_info_ratelimited("You must specify a L4 protocol and not use inversions on it\n"); return -ENOENT; } helper = nf_conntrack_helper_try_module_get(helper_name, par->family, proto); if (helper == NULL) { pr_info_ratelimited("No such helper \"%s\"\n", helper_name); return -ENOENT; } help = nf_ct_helper_ext_add(ct, GFP_KERNEL); if (help == NULL) { nf_conntrack_helper_put(helper); return -ENOMEM; } rcu_assign_pointer(help->helper, helper); return 0; } static int xt_ct_set_timeout(struct nf_conn *ct, const struct xt_tgchk_param *par, const char *timeout_name) { #ifdef CONFIG_NF_CONNTRACK_TIMEOUT const struct nf_conntrack_l4proto *l4proto; u8 proto; proto = xt_ct_find_proto(par); if (!proto) { pr_info_ratelimited("You must specify a L4 protocol and not " "use inversions on it"); return -EINVAL; } l4proto = nf_ct_l4proto_find(proto); return nf_ct_set_timeout(par->net, ct, par->family, l4proto->l4proto, timeout_name); #else return -EOPNOTSUPP; #endif } static u16 xt_ct_flags_to_dir(const struct xt_ct_target_info_v1 *info) { switch (info->flags & (XT_CT_ZONE_DIR_ORIG | XT_CT_ZONE_DIR_REPL)) { case XT_CT_ZONE_DIR_ORIG: return NF_CT_ZONE_DIR_ORIG; case XT_CT_ZONE_DIR_REPL: return NF_CT_ZONE_DIR_REPL; default: return NF_CT_DEFAULT_ZONE_DIR; } } static void xt_ct_put_helper(struct nf_conn_help *help) { struct nf_conntrack_helper *helper; if (!help) return; /* not yet exposed to other cpus, or ruleset * already detached (post-replacement). */ helper = rcu_dereference_raw(help->helper); if (helper) nf_conntrack_helper_put(helper); } static int xt_ct_tg_check(const struct xt_tgchk_param *par, struct xt_ct_target_info_v1 *info) { struct nf_conntrack_zone zone; struct nf_conn_help *help; struct nf_conn *ct; int ret = -EOPNOTSUPP; if (info->flags & XT_CT_NOTRACK) { ct = NULL; goto out; } #ifndef CONFIG_NF_CONNTRACK_ZONES if (info->zone || info->flags & (XT_CT_ZONE_DIR_ORIG | XT_CT_ZONE_DIR_REPL | XT_CT_ZONE_MARK)) goto err1; #endif ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) goto err1; memset(&zone, 0, sizeof(zone)); zone.id = info->zone; zone.dir = xt_ct_flags_to_dir(info); if (info->flags & XT_CT_ZONE_MARK) zone.flags |= NF_CT_FLAG_MARK; ct = nf_ct_tmpl_alloc(par->net, &zone, GFP_KERNEL); if (!ct) { ret = -ENOMEM; goto err2; } if ((info->ct_events || info->exp_events) && !nf_ct_ecache_ext_add(ct, info->ct_events, info->exp_events, GFP_KERNEL)) { ret = -EINVAL; goto err3; } if (info->helper[0]) { if (strnlen(info->helper, sizeof(info->helper)) == sizeof(info->helper)) { ret = -ENAMETOOLONG; goto err3; } ret = xt_ct_set_helper(ct, info->helper, par); if (ret < 0) goto err3; } if (info->timeout[0]) { if (strnlen(info->timeout, sizeof(info->timeout)) == sizeof(info->timeout)) { ret = -ENAMETOOLONG; goto err4; } ret = xt_ct_set_timeout(ct, par, info->timeout); if (ret < 0) goto err4; } __set_bit(IPS_CONFIRMED_BIT, &ct->status); out: info->ct = ct; return 0; err4: help = nfct_help(ct); xt_ct_put_helper(help); err3: nf_ct_tmpl_free(ct); err2: nf_ct_netns_put(par->net, par->family); err1: return ret; } static int xt_ct_tg_check_v0(const struct xt_tgchk_param *par) { struct xt_ct_target_info *info = par->targinfo; struct xt_ct_target_info_v1 info_v1 = { .flags = info->flags, .zone = info->zone, .ct_events = info->ct_events, .exp_events = info->exp_events, }; int ret; if (info->flags & ~XT_CT_NOTRACK) return -EINVAL; memcpy(info_v1.helper, info->helper, sizeof(info->helper)); ret = xt_ct_tg_check(par, &info_v1); if (ret < 0) return ret; info->ct = info_v1.ct; return ret; } static int xt_ct_tg_check_v1(const struct xt_tgchk_param *par) { struct xt_ct_target_info_v1 *info = par->targinfo; if (info->flags & ~XT_CT_NOTRACK) return -EINVAL; return xt_ct_tg_check(par, par->targinfo); } static int xt_ct_tg_check_v2(const struct xt_tgchk_param *par) { struct xt_ct_target_info_v1 *info = par->targinfo; if (info->flags & ~XT_CT_MASK) return -EINVAL; return xt_ct_tg_check(par, par->targinfo); } static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par, struct xt_ct_target_info_v1 *info) { struct nf_conn *ct = info->ct; struct nf_conn_help *help; if (ct) { help = nfct_help(ct); xt_ct_put_helper(help); nf_ct_netns_put(par->net, par->family); nf_ct_destroy_timeout(ct); nf_ct_put(info->ct); } } static void xt_ct_tg_destroy_v0(const struct xt_tgdtor_param *par) { struct xt_ct_target_info *info = par->targinfo; struct xt_ct_target_info_v1 info_v1 = { .flags = info->flags, .zone = info->zone, .ct_events = info->ct_events, .exp_events = info->exp_events, .ct = info->ct, }; memcpy(info_v1.helper, info->helper, sizeof(info->helper)); xt_ct_tg_destroy(par, &info_v1); } static void xt_ct_tg_destroy_v1(const struct xt_tgdtor_param *par) { xt_ct_tg_destroy(par, par->targinfo); } static struct xt_target xt_ct_tg_reg[] __read_mostly = { { .name = "CT", .family = NFPROTO_UNSPEC, .targetsize = sizeof(struct xt_ct_target_info), .usersize = offsetof(struct xt_ct_target_info, ct), .checkentry = xt_ct_tg_check_v0, .destroy = xt_ct_tg_destroy_v0, .target = xt_ct_target_v0, .table = "raw", .me = THIS_MODULE, }, { .name = "CT", .family = NFPROTO_UNSPEC, .revision = 1, .targetsize = sizeof(struct xt_ct_target_info_v1), .usersize = offsetof(struct xt_ct_target_info, ct), .checkentry = xt_ct_tg_check_v1, .destroy = xt_ct_tg_destroy_v1, .target = xt_ct_target_v1, .table = "raw", .me = THIS_MODULE, }, { .name = "CT", .family = NFPROTO_UNSPEC, .revision = 2, .targetsize = sizeof(struct xt_ct_target_info_v1), .usersize = offsetof(struct xt_ct_target_info, ct), .checkentry = xt_ct_tg_check_v2, .destroy = xt_ct_tg_destroy_v1, .target = xt_ct_target_v1, .table = "raw", .me = THIS_MODULE, }, }; static unsigned int notrack_tg(struct sk_buff *skb, const struct xt_action_param *par) { /* Previously seen (loopback)? Ignore. */ if (skb->_nfct != 0) return XT_CONTINUE; nf_ct_set(skb, NULL, IP_CT_UNTRACKED); return XT_CONTINUE; } static struct xt_target notrack_tg_reg __read_mostly = { .name = "NOTRACK", .revision = 0, .family = NFPROTO_UNSPEC, .target = notrack_tg, .table = "raw", .me = THIS_MODULE, }; static int __init xt_ct_tg_init(void) { int ret; ret = xt_register_target(&notrack_tg_reg); if (ret < 0) return ret; ret = xt_register_targets(xt_ct_tg_reg, ARRAY_SIZE(xt_ct_tg_reg)); if (ret < 0) { xt_unregister_target(&notrack_tg_reg); return ret; } return 0; } static void __exit xt_ct_tg_exit(void) { xt_unregister_targets(xt_ct_tg_reg, ARRAY_SIZE(xt_ct_tg_reg)); xt_unregister_target(&notrack_tg_reg); } module_init(xt_ct_tg_init); module_exit(xt_ct_tg_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Xtables: connection tracking target"); MODULE_ALIAS("ipt_CT"); MODULE_ALIAS("ip6t_CT"); MODULE_ALIAS("ipt_NOTRACK"); MODULE_ALIAS("ip6t_NOTRACK");
2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 /* gf128mul.h - GF(2^128) multiplication functions * * Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. * Copyright (c) 2006 Rik Snel <rsnel@cube.dyndns.org> * * Based on Dr Brian Gladman's (GPL'd) work published at * http://fp.gladman.plus.com/cryptography_technology/index.htm * See the original copyright notice below. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free * Software Foundation; either version 2 of the License, or (at your option) * any later version. */ /* --------------------------------------------------------------------------- Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. LICENSE TERMS The free distribution and use of this software in both source and binary form is allowed (with or without changes) provided that: 1. distributions of this source code include the above copyright notice, this list of conditions and the following disclaimer; 2. distributions in binary form include the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other associated materials; 3. the copyright holder's name is not used to endorse products built using this software without specific written permission. ALTERNATIVELY, provided that this notice is retained in full, this product may be distributed under the terms of the GNU General Public License (GPL), in which case the provisions of the GPL apply INSTEAD OF those given above. DISCLAIMER This software is provided 'as is' with no explicit or implied warranties in respect of its properties, including, but not limited to, correctness and/or fitness for purpose. --------------------------------------------------------------------------- Issue Date: 31/01/2006 An implementation of field multiplication in Galois Field GF(2^128) */ #ifndef _CRYPTO_GF128MUL_H #define _CRYPTO_GF128MUL_H #include <asm/byteorder.h> #include <crypto/b128ops.h> #include <linux/slab.h> /* Comment by Rik: * * For some background on GF(2^128) see for example: * http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf * * The elements of GF(2^128) := GF(2)[X]/(X^128-X^7-X^2-X^1-1) can * be mapped to computer memory in a variety of ways. Let's examine * three common cases. * * Take a look at the 16 binary octets below in memory order. The msb's * are left and the lsb's are right. char b[16] is an array and b[0] is * the first octet. * * 10000000 00000000 00000000 00000000 .... 00000000 00000000 00000000 * b[0] b[1] b[2] b[3] b[13] b[14] b[15] * * Every bit is a coefficient of some power of X. We can store the bits * in every byte in little-endian order and the bytes themselves also in * little endian order. I will call this lle (little-little-endian). * The above buffer represents the polynomial 1, and X^7+X^2+X^1+1 looks * like 11100001 00000000 .... 00000000 = { 0xE1, 0x00, }. * This format was originally implemented in gf128mul and is used * in GCM (Galois/Counter mode) and in ABL (Arbitrary Block Length). * * Another convention says: store the bits in bigendian order and the * bytes also. This is bbe (big-big-endian). Now the buffer above * represents X^127. X^7+X^2+X^1+1 looks like 00000000 .... 10000111, * b[15] = 0x87 and the rest is 0. LRW uses this convention and bbe * is partly implemented. * * Both of the above formats are easy to implement on big-endian * machines. * * XTS and EME (the latter of which is patent encumbered) use the ble * format (bits are stored in big endian order and the bytes in little * endian). The above buffer represents X^7 in this case and the * primitive polynomial is b[0] = 0x87. * * The common machine word-size is smaller than 128 bits, so to make * an efficient implementation we must split into machine word sizes. * This implementation uses 64-bit words for the moment. Machine * endianness comes into play. The lle format in relation to machine * endianness is discussed below by the original author of gf128mul Dr * Brian Gladman. * * Let's look at the bbe and ble format on a little endian machine. * * bbe on a little endian machine u32 x[4]: * * MS x[0] LS MS x[1] LS * ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls * 103..96 111.104 119.112 127.120 71...64 79...72 87...80 95...88 * * MS x[2] LS MS x[3] LS * ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls * 39...32 47...40 55...48 63...56 07...00 15...08 23...16 31...24 * * ble on a little endian machine * * MS x[0] LS MS x[1] LS * ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls * 31...24 23...16 15...08 07...00 63...56 55...48 47...40 39...32 * * MS x[2] LS MS x[3] LS * ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls * 95...88 87...80 79...72 71...64 127.120 199.112 111.104 103..96 * * Multiplications in GF(2^128) are mostly bit-shifts, so you see why * ble (and lbe also) are easier to implement on a little-endian * machine than on a big-endian machine. The converse holds for bbe * and lle. * * Note: to have good alignment, it seems to me that it is sufficient * to keep elements of GF(2^128) in type u64[2]. On 32-bit wordsize * machines this will automatically aligned to wordsize and on a 64-bit * machine also. */ /* Multiply a GF(2^128) field element by x. Field elements are held in arrays of bytes in which field bits 8n..8n + 7 are held in byte[n], with lower indexed bits placed in the more numerically significant bit positions within bytes. On little endian machines the bit indexes translate into the bit positions within four 32-bit words in the following way MS x[0] LS MS x[1] LS ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls 24...31 16...23 08...15 00...07 56...63 48...55 40...47 32...39 MS x[2] LS MS x[3] LS ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls 88...95 80...87 72...79 64...71 120.127 112.119 104.111 96..103 On big endian machines the bit indexes translate into the bit positions within four 32-bit words in the following way MS x[0] LS MS x[1] LS ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls 00...07 08...15 16...23 24...31 32...39 40...47 48...55 56...63 MS x[2] LS MS x[3] LS ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls 64...71 72...79 80...87 88...95 96..103 104.111 112.119 120.127 */ /* A slow generic version of gf_mul, implemented for lle and bbe * It multiplies a and b and puts the result in a */ void gf128mul_lle(be128 *a, const be128 *b); void gf128mul_bbe(be128 *a, const be128 *b); /* * The following functions multiply a field element by x in * the polynomial field representation. They use 64-bit word operations * to gain speed but compensate for machine endianness and hence work * correctly on both styles of machine. * * They are defined here for performance. */ static inline u64 gf128mul_mask_from_bit(u64 x, int which) { /* a constant-time version of 'x & ((u64)1 << which) ? (u64)-1 : 0' */ return ((s64)(x << (63 - which)) >> 63); } static inline void gf128mul_x_lle(be128 *r, const be128 *x) { u64 a = be64_to_cpu(x->a); u64 b = be64_to_cpu(x->b); /* equivalent to gf128mul_table_le[(b << 7) & 0xff] << 48 * (see crypto/gf128mul.c): */ u64 _tt = gf128mul_mask_from_bit(b, 0) & ((u64)0xe1 << 56); r->b = cpu_to_be64((b >> 1) | (a << 63)); r->a = cpu_to_be64((a >> 1) ^ _tt); } static inline void gf128mul_x_bbe(be128 *r, const be128 *x) { u64 a = be64_to_cpu(x->a); u64 b = be64_to_cpu(x->b); /* equivalent to gf128mul_table_be[a >> 63] (see crypto/gf128mul.c): */ u64 _tt = gf128mul_mask_from_bit(a, 63) & 0x87; r->a = cpu_to_be64((a << 1) | (b >> 63)); r->b = cpu_to_be64((b << 1) ^ _tt); } /* needed by XTS */ static inline void gf128mul_x_ble(le128 *r, const le128 *x) { u64 a = le64_to_cpu(x->a); u64 b = le64_to_cpu(x->b); /* equivalent to gf128mul_table_be[b >> 63] (see crypto/gf128mul.c): */ u64 _tt = gf128mul_mask_from_bit(a, 63) & 0x87; r->a = cpu_to_le64((a << 1) | (b >> 63)); r->b = cpu_to_le64((b << 1) ^ _tt); } /* 4k table optimization */ struct gf128mul_4k { be128 t[256]; }; struct gf128mul_4k *gf128mul_init_4k_lle(const be128 *g); struct gf128mul_4k *gf128mul_init_4k_bbe(const be128 *g); void gf128mul_4k_lle(be128 *a, const struct gf128mul_4k *t); void gf128mul_4k_bbe(be128 *a, const struct gf128mul_4k *t); void gf128mul_x8_ble(le128 *r, const le128 *x); static inline void gf128mul_free_4k(struct gf128mul_4k *t) { kfree_sensitive(t); } /* 64k table optimization, implemented for bbe */ struct gf128mul_64k { struct gf128mul_4k *t[16]; }; /* First initialize with the constant factor with which you * want to multiply and then call gf128mul_64k_bbe with the other * factor in the first argument, and the table in the second. * Afterwards, the result is stored in *a. */ struct gf128mul_64k *gf128mul_init_64k_bbe(const be128 *g); void gf128mul_free_64k(struct gf128mul_64k *t); void gf128mul_64k_bbe(be128 *a, const struct gf128mul_64k *t); #endif /* _CRYPTO_GF128MUL_H */
28 4 6 6 3 6 1 1 1 1 1 3 3 3 2 1 1 3 22 22 22 22 22 22 3 3 2 3 2 2 2 2 3 3 3 3 3 3 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 /* * cgroup_freezer.c - control group freezer subsystem * * Copyright IBM Corporation, 2007 * * Author : Cedric Le Goater <clg@fr.ibm.com> * * This program is free software; you can redistribute it and/or modify it * under the terms of version 2.1 of the GNU Lesser General Public License * as published by the Free Software Foundation. * * This program is distributed in the hope that it would be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. */ #include <linux/export.h> #include <linux/slab.h> #include <linux/cgroup.h> #include <linux/fs.h> #include <linux/uaccess.h> #include <linux/freezer.h> #include <linux/seq_file.h> #include <linux/mutex.h> #include <linux/cpu.h> /* * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of * its ancestors has FREEZING_SELF set. */ enum freezer_state_flags { CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */ CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */ CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */ CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */ /* mask for all FREEZING flags */ CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT, }; struct freezer { struct cgroup_subsys_state css; unsigned int state; }; static DEFINE_MUTEX(freezer_mutex); static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) { return css ? container_of(css, struct freezer, css) : NULL; } static inline struct freezer *task_freezer(struct task_struct *task) { return css_freezer(task_css(task, freezer_cgrp_id)); } static struct freezer *parent_freezer(struct freezer *freezer) { return css_freezer(freezer->css.parent); } bool cgroup_freezing(struct task_struct *task) { bool ret; unsigned int state; rcu_read_lock(); /* Check if the cgroup is still FREEZING, but not FROZEN. The extra * !FROZEN check is required, because the FREEZING bit is not cleared * when the state FROZEN is reached. */ state = task_freezer(task)->state; ret = (state & CGROUP_FREEZING) && !(state & CGROUP_FROZEN); rcu_read_unlock(); return ret; } static const char *freezer_state_strs(unsigned int state) { if (state & CGROUP_FROZEN) return "FROZEN"; if (state & CGROUP_FREEZING) return "FREEZING"; return "THAWED"; }; static struct cgroup_subsys_state * freezer_css_alloc(struct cgroup_subsys_state *parent_css) { struct freezer *freezer; freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL); if (!freezer) return ERR_PTR(-ENOMEM); return &freezer->css; } /** * freezer_css_online - commit creation of a freezer css * @css: css being created * * We're committing to creation of @css. Mark it online and inherit * parent's freezing state while holding cpus read lock and freezer_mutex. */ static int freezer_css_online(struct cgroup_subsys_state *css) { struct freezer *freezer = css_freezer(css); struct freezer *parent = parent_freezer(freezer); cpus_read_lock(); mutex_lock(&freezer_mutex); freezer->state |= CGROUP_FREEZER_ONLINE; if (parent && (parent->state & CGROUP_FREEZING)) { freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN; static_branch_inc_cpuslocked(&freezer_active); } mutex_unlock(&freezer_mutex); cpus_read_unlock(); return 0; } /** * freezer_css_offline - initiate destruction of a freezer css * @css: css being destroyed * * @css is going away. Mark it dead and decrement freezer_active if * it was holding one. */ static void freezer_css_offline(struct cgroup_subsys_state *css) { struct freezer *freezer = css_freezer(css); cpus_read_lock(); mutex_lock(&freezer_mutex); if (freezer->state & CGROUP_FREEZING) static_branch_dec_cpuslocked(&freezer_active); freezer->state = 0; mutex_unlock(&freezer_mutex); cpus_read_unlock(); } static void freezer_css_free(struct cgroup_subsys_state *css) { kfree(css_freezer(css)); } /* * Tasks can be migrated into a different freezer anytime regardless of its * current state. freezer_attach() is responsible for making new tasks * conform to the current state. * * Freezer state changes and task migration are synchronized via * @freezer->lock. freezer_attach() makes the new tasks conform to the * current state and all following state changes can see the new tasks. */ static void freezer_attach(struct cgroup_taskset *tset) { struct task_struct *task; struct cgroup_subsys_state *new_css; mutex_lock(&freezer_mutex); /* * Make the new tasks conform to the current state of @new_css. * For simplicity, when migrating any task to a FROZEN cgroup, we * revert it to FREEZING and let update_if_frozen() determine the * correct state later. * * Tasks in @tset are on @new_css but may not conform to its * current state before executing the following - !frozen tasks may * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. */ cgroup_taskset_for_each(task, new_css, tset) { struct freezer *freezer = css_freezer(new_css); if (!(freezer->state & CGROUP_FREEZING)) { __thaw_task(task); } else { freeze_task(task); /* clear FROZEN and propagate upwards */ while (freezer && (freezer->state & CGROUP_FROZEN)) { freezer->state &= ~CGROUP_FROZEN; freezer = parent_freezer(freezer); } } } mutex_unlock(&freezer_mutex); } /** * freezer_fork - cgroup post fork callback * @task: a task which has just been forked * * @task has just been created and should conform to the current state of * the cgroup_freezer it belongs to. This function may race against * freezer_attach(). Losing to freezer_attach() means that we don't have * to do anything as freezer_attach() will put @task into the appropriate * state. */ static void freezer_fork(struct task_struct *task) { struct freezer *freezer; /* * The root cgroup is non-freezable, so we can skip locking the * freezer. This is safe regardless of race with task migration. * If we didn't race or won, skipping is obviously the right thing * to do. If we lost and root is the new cgroup, noop is still the * right thing to do. */ if (task_css_is_root(task, freezer_cgrp_id)) return; mutex_lock(&freezer_mutex); rcu_read_lock(); freezer = task_freezer(task); if (freezer->state & CGROUP_FREEZING) freeze_task(task); rcu_read_unlock(); mutex_unlock(&freezer_mutex); } /** * update_if_frozen - update whether a cgroup finished freezing * @css: css of interest * * Once FREEZING is initiated, transition to FROZEN is lazily updated by * calling this function. If the current state is FREEZING but not FROZEN, * this function checks whether all tasks of this cgroup and the descendant * cgroups finished freezing and, if so, sets FROZEN. * * The caller is responsible for grabbing RCU read lock and calling * update_if_frozen() on all descendants prior to invoking this function. * * Task states and freezer state might disagree while tasks are being * migrated into or out of @css, so we can't verify task states against * @freezer state here. See freezer_attach() for details. */ static void update_if_frozen(struct cgroup_subsys_state *css) { struct freezer *freezer = css_freezer(css); struct cgroup_subsys_state *pos; struct css_task_iter it; struct task_struct *task; lockdep_assert_held(&freezer_mutex); if (!(freezer->state & CGROUP_FREEZING) || (freezer->state & CGROUP_FROZEN)) return; /* are all (live) children frozen? */ rcu_read_lock(); css_for_each_child(pos, css) { struct freezer *child = css_freezer(pos); if ((child->state & CGROUP_FREEZER_ONLINE) && !(child->state & CGROUP_FROZEN)) { rcu_read_unlock(); return; } } rcu_read_unlock(); /* are all tasks frozen? */ css_task_iter_start(css, 0, &it); while ((task = css_task_iter_next(&it))) { if (freezing(task) && !frozen(task)) goto out_iter_end; } freezer->state |= CGROUP_FROZEN; out_iter_end: css_task_iter_end(&it); } static int freezer_read(struct seq_file *m, void *v) { struct cgroup_subsys_state *css = seq_css(m), *pos; mutex_lock(&freezer_mutex); rcu_read_lock(); /* update states bottom-up */ css_for_each_descendant_post(pos, css) { if (!css_tryget_online(pos)) continue; rcu_read_unlock(); update_if_frozen(pos); rcu_read_lock(); css_put(pos); } rcu_read_unlock(); mutex_unlock(&freezer_mutex); seq_puts(m, freezer_state_strs(css_freezer(css)->state)); seq_putc(m, '\n'); return 0; } static void freeze_cgroup(struct freezer *freezer) { struct css_task_iter it; struct task_struct *task; css_task_iter_start(&freezer->css, 0, &it); while ((task = css_task_iter_next(&it))) freeze_task(task); css_task_iter_end(&it); } static void unfreeze_cgroup(struct freezer *freezer) { struct css_task_iter it; struct task_struct *task; css_task_iter_start(&freezer->css, 0, &it); while ((task = css_task_iter_next(&it))) __thaw_task(task); css_task_iter_end(&it); } /** * freezer_apply_state - apply state change to a single cgroup_freezer * @freezer: freezer to apply state change to * @freeze: whether to freeze or unfreeze * @state: CGROUP_FREEZING_* flag to set or clear * * Set or clear @state on @cgroup according to @freeze, and perform * freezing or thawing as necessary. */ static void freezer_apply_state(struct freezer *freezer, bool freeze, unsigned int state) { /* also synchronizes against task migration, see freezer_attach() */ lockdep_assert_held(&freezer_mutex); if (!(freezer->state & CGROUP_FREEZER_ONLINE)) return; if (freeze) { if (!(freezer->state & CGROUP_FREEZING)) static_branch_inc_cpuslocked(&freezer_active); freezer->state |= state; freeze_cgroup(freezer); } else { bool was_freezing = freezer->state & CGROUP_FREEZING; freezer->state &= ~state; if (!(freezer->state & CGROUP_FREEZING)) { freezer->state &= ~CGROUP_FROZEN; if (was_freezing) static_branch_dec_cpuslocked(&freezer_active); unfreeze_cgroup(freezer); } } } /** * freezer_change_state - change the freezing state of a cgroup_freezer * @freezer: freezer of interest * @freeze: whether to freeze or thaw * * Freeze or thaw @freezer according to @freeze. The operations are * recursive - all descendants of @freezer will be affected. */ static void freezer_change_state(struct freezer *freezer, bool freeze) { struct cgroup_subsys_state *pos; cpus_read_lock(); /* * Update all its descendants in pre-order traversal. Each * descendant will try to inherit its parent's FREEZING state as * CGROUP_FREEZING_PARENT. */ mutex_lock(&freezer_mutex); rcu_read_lock(); css_for_each_descendant_pre(pos, &freezer->css) { struct freezer *pos_f = css_freezer(pos); struct freezer *parent = parent_freezer(pos_f); if (!css_tryget_online(pos)) continue; rcu_read_unlock(); if (pos_f == freezer) freezer_apply_state(pos_f, freeze, CGROUP_FREEZING_SELF); else freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING, CGROUP_FREEZING_PARENT); rcu_read_lock(); css_put(pos); } rcu_read_unlock(); mutex_unlock(&freezer_mutex); cpus_read_unlock(); } static ssize_t freezer_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { bool freeze; buf = strstrip(buf); if (strcmp(buf, freezer_state_strs(0)) == 0) freeze = false; else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0) freeze = true; else return -EINVAL; freezer_change_state(css_freezer(of_css(of)), freeze); return nbytes; } static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css, struct cftype *cft) { struct freezer *freezer = css_freezer(css); return (bool)(freezer->state & CGROUP_FREEZING_SELF); } static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css, struct cftype *cft) { struct freezer *freezer = css_freezer(css); return (bool)(freezer->state & CGROUP_FREEZING_PARENT); } static struct cftype files[] = { { .name = "state", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = freezer_read, .write = freezer_write, }, { .name = "self_freezing", .flags = CFTYPE_NOT_ON_ROOT, .read_u64 = freezer_self_freezing_read, }, { .name = "parent_freezing", .flags = CFTYPE_NOT_ON_ROOT, .read_u64 = freezer_parent_freezing_read, }, { } /* terminate */ }; struct cgroup_subsys freezer_cgrp_subsys = { .css_alloc = freezer_css_alloc, .css_online = freezer_css_online, .css_offline = freezer_css_offline, .css_free = freezer_css_free, .attach = freezer_attach, .fork = freezer_fork, .legacy_cftypes = files, };
30 5 30 2 2 2 29 29 30 30 29 30 3 3 3 5 4 5 3 3 3 3 3 3 3 9 3 9 139 140 140 142 2 2 2 2 2 2 142 142 141 9 15 15 15 14 14 10 11 10 1 9 9 9 9 9 7 7 9 3 3 3 2 9 9 6 8 13 2 2 2 1 1 12 12 12 12 12 12 7 8 8 8 8 8 1 1 1 1 1 1 1 1 1 207 206 1 207 2 2 2 2 1 2 1 2 2 1 1 2 2 2 5 5 5 1 1 5 3 3 2 2 2 1 1 1 2 5 2 2 2 2 2 2 2 2 2 167 170 168 161 168 165 2 168 163 5 164 141 140 140 141 140 1 139 134 31 30 30 30 30 30 30 30 5 167 112 111 113 8 9 8 1 8 1 1 1 1 1 1 1 9 9 15 13 7 1 1 13 13 14 9 9 9 2 2 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 // SPDX-License-Identifier: GPL-2.0-only /* * fs/fs-writeback.c * * Copyright (C) 2002, Linus Torvalds. * * Contains all the functions related to writing back and waiting * upon dirty inodes against superblocks, and writing back dirty * pages against inodes. ie: data writeback. Writeout of the * inode itself is not handled here. * * 10Apr2002 Andrew Morton * Split out of fs/inode.c * Additions for address_space-based writeback */ #include <linux/kernel.h> #include <linux/export.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/kthread.h> #include <linux/writeback.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/tracepoint.h> #include <linux/device.h> #include <linux/memcontrol.h> #include "internal.h" /* * 4MB minimal write chunk size */ #define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10)) /* * Passed into wb_writeback(), essentially a subset of writeback_control */ struct wb_writeback_work { long nr_pages; struct super_block *sb; enum writeback_sync_modes sync_mode; unsigned int tagged_writepages:1; unsigned int for_kupdate:1; unsigned int range_cyclic:1; unsigned int for_background:1; unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ unsigned int auto_free:1; /* free on completion */ enum wb_reason reason; /* why was writeback initiated? */ struct list_head list; /* pending work list */ struct wb_completion *done; /* set if the caller waits */ }; /* * If an inode is constantly having its pages dirtied, but then the * updates stop dirtytime_expire_interval seconds in the past, it's * possible for the worst case time between when an inode has its * timestamps updated and when they finally get written out to be two * dirtytime_expire_intervals. We set the default to 12 hours (in * seconds), which means most of the time inodes will have their * timestamps written to disk after 12 hours, but in the worst case a * few inodes might not their timestamps updated for 24 hours. */ unsigned int dirtytime_expire_interval = 12 * 60 * 60; static inline struct inode *wb_inode(struct list_head *head) { return list_entry(head, struct inode, i_io_list); } /* * Include the creation of the trace points after defining the * wb_writeback_work structure and inline functions so that the definition * remains local to this file. */ #define CREATE_TRACE_POINTS #include <trace/events/writeback.h> EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage); static bool wb_io_lists_populated(struct bdi_writeback *wb) { if (wb_has_dirty_io(wb)) { return false; } else { set_bit(WB_has_dirty_io, &wb->state); WARN_ON_ONCE(!wb->avg_write_bandwidth); atomic_long_add(wb->avg_write_bandwidth, &wb->bdi->tot_write_bandwidth); return true; } } static void wb_io_lists_depopulated(struct bdi_writeback *wb) { if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) && list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) { clear_bit(WB_has_dirty_io, &wb->state); WARN_ON_ONCE(atomic_long_sub_return(wb->avg_write_bandwidth, &wb->bdi->tot_write_bandwidth) < 0); } } /** * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list * @inode: inode to be moved * @wb: target bdi_writeback * @head: one of @wb->b_{dirty|io|more_io|dirty_time} * * Move @inode->i_io_list to @list of @wb and set %WB_has_dirty_io. * Returns %true if @inode is the first occupant of the !dirty_time IO * lists; otherwise, %false. */ static bool inode_io_list_move_locked(struct inode *inode, struct bdi_writeback *wb, struct list_head *head) { assert_spin_locked(&wb->list_lock); assert_spin_locked(&inode->i_lock); WARN_ON_ONCE(inode->i_state & I_FREEING); list_move(&inode->i_io_list, head); /* dirty_time doesn't count as dirty_io until expiration */ if (head != &wb->b_dirty_time) return wb_io_lists_populated(wb); wb_io_lists_depopulated(wb); return false; } static void wb_wakeup(struct bdi_writeback *wb) { spin_lock_irq(&wb->work_lock); if (test_bit(WB_registered, &wb->state)) mod_delayed_work(bdi_wq, &wb->dwork, 0); spin_unlock_irq(&wb->work_lock); } /* * This function is used when the first inode for this wb is marked dirty. It * wakes-up the corresponding bdi thread which should then take care of the * periodic background write-out of dirty inodes. Since the write-out would * starts only 'dirty_writeback_interval' centisecs from now anyway, we just * set up a timer which wakes the bdi thread up later. * * Note, we wouldn't bother setting up the timer, but this function is on the * fast-path (used by '__mark_inode_dirty()'), so we save few context switches * by delaying the wake-up. * * We have to be careful not to postpone flush work if it is scheduled for * earlier. Thus we use queue_delayed_work(). */ static void wb_wakeup_delayed(struct bdi_writeback *wb) { unsigned long timeout; timeout = msecs_to_jiffies(dirty_writeback_interval * 10); spin_lock_irq(&wb->work_lock); if (test_bit(WB_registered, &wb->state)) queue_delayed_work(bdi_wq, &wb->dwork, timeout); spin_unlock_irq(&wb->work_lock); } static void finish_writeback_work(struct wb_writeback_work *work) { struct wb_completion *done = work->done; if (work->auto_free) kfree(work); if (done) { wait_queue_head_t *waitq = done->waitq; /* @done can't be accessed after the following dec */ if (atomic_dec_and_test(&done->cnt)) wake_up_all(waitq); } } static void wb_queue_work(struct bdi_writeback *wb, struct wb_writeback_work *work) { trace_writeback_queue(wb, work); if (work->done) atomic_inc(&work->done->cnt); spin_lock_irq(&wb->work_lock); if (test_bit(WB_registered, &wb->state)) { list_add_tail(&work->list, &wb->work_list); mod_delayed_work(bdi_wq, &wb->dwork, 0); } else finish_writeback_work(work); spin_unlock_irq(&wb->work_lock); } /** * wb_wait_for_completion - wait for completion of bdi_writeback_works * @done: target wb_completion * * Wait for one or more work items issued to @bdi with their ->done field * set to @done, which should have been initialized with * DEFINE_WB_COMPLETION(). This function returns after all such work items * are completed. Work items which are waited upon aren't freed * automatically on completion. */ void wb_wait_for_completion(struct wb_completion *done) { atomic_dec(&done->cnt); /* put down the initial count */ wait_event(*done->waitq, !atomic_read(&done->cnt)); } #ifdef CONFIG_CGROUP_WRITEBACK /* * Parameters for foreign inode detection, see wbc_detach_inode() to see * how they're used. * * These paramters are inherently heuristical as the detection target * itself is fuzzy. All we want to do is detaching an inode from the * current owner if it's being written to by some other cgroups too much. * * The current cgroup writeback is built on the assumption that multiple * cgroups writing to the same inode concurrently is very rare and a mode * of operation which isn't well supported. As such, the goal is not * taking too long when a different cgroup takes over an inode while * avoiding too aggressive flip-flops from occasional foreign writes. * * We record, very roughly, 2s worth of IO time history and if more than * half of that is foreign, trigger the switch. The recording is quantized * to 16 slots. To avoid tiny writes from swinging the decision too much, * writes smaller than 1/8 of avg size are ignored. */ #define WB_FRN_TIME_SHIFT 13 /* 1s = 2^13, upto 8 secs w/ 16bit */ #define WB_FRN_TIME_AVG_SHIFT 3 /* avg = avg * 7/8 + new * 1/8 */ #define WB_FRN_TIME_CUT_DIV 8 /* ignore rounds < avg / 8 */ #define WB_FRN_TIME_PERIOD (2 * (1 << WB_FRN_TIME_SHIFT)) /* 2s */ #define WB_FRN_HIST_SLOTS 16 /* inode->i_wb_frn_history is 16bit */ #define WB_FRN_HIST_UNIT (WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS) /* each slot's duration is 2s / 16 */ #define WB_FRN_HIST_THR_SLOTS (WB_FRN_HIST_SLOTS / 2) /* if foreign slots >= 8, switch */ #define WB_FRN_HIST_MAX_SLOTS (WB_FRN_HIST_THR_SLOTS / 2 + 1) /* one round can affect upto 5 slots */ #define WB_FRN_MAX_IN_FLIGHT 1024 /* don't queue too many concurrently */ /* * Maximum inodes per isw. A specific value has been chosen to make * struct inode_switch_wbs_context fit into 1024 bytes kmalloc. */ #define WB_MAX_INODES_PER_ISW ((1024UL - sizeof(struct inode_switch_wbs_context)) \ / sizeof(struct inode *)) static atomic_t isw_nr_in_flight = ATOMIC_INIT(0); static struct workqueue_struct *isw_wq; void __inode_attach_wb(struct inode *inode, struct folio *folio) { struct backing_dev_info *bdi = inode_to_bdi(inode); struct bdi_writeback *wb = NULL; if (inode_cgwb_enabled(inode)) { struct cgroup_subsys_state *memcg_css; if (folio) { memcg_css = mem_cgroup_css_from_folio(folio); wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); } else { /* must pin memcg_css, see wb_get_create() */ memcg_css = task_get_css(current, memory_cgrp_id); wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); css_put(memcg_css); } } if (!wb) wb = &bdi->wb; /* * There may be multiple instances of this function racing to * update the same inode. Use cmpxchg() to tell the winner. */ if (unlikely(cmpxchg(&inode->i_wb, NULL, wb))) wb_put(wb); } EXPORT_SYMBOL_GPL(__inode_attach_wb); /** * inode_cgwb_move_to_attached - put the inode onto wb->b_attached list * @inode: inode of interest with i_lock held * @wb: target bdi_writeback * * Remove the inode from wb's io lists and if necessarily put onto b_attached * list. Only inodes attached to cgwb's are kept on this list. */ static void inode_cgwb_move_to_attached(struct inode *inode, struct bdi_writeback *wb) { assert_spin_locked(&wb->list_lock); assert_spin_locked(&inode->i_lock); WARN_ON_ONCE(inode->i_state & I_FREEING); inode->i_state &= ~I_SYNC_QUEUED; if (wb != &wb->bdi->wb) list_move(&inode->i_io_list, &wb->b_attached); else list_del_init(&inode->i_io_list); wb_io_lists_depopulated(wb); } /** * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it * @inode: inode of interest with i_lock held * * Returns @inode's wb with its list_lock held. @inode->i_lock must be * held on entry and is released on return. The returned wb is guaranteed * to stay @inode's associated wb until its list_lock is released. */ static struct bdi_writeback * locked_inode_to_wb_and_lock_list(struct inode *inode) __releases(&inode->i_lock) __acquires(&wb->list_lock) { while (true) { struct bdi_writeback *wb = inode_to_wb(inode); /* * inode_to_wb() association is protected by both * @inode->i_lock and @wb->list_lock but list_lock nests * outside i_lock. Drop i_lock and verify that the * association hasn't changed after acquiring list_lock. */ wb_get(wb); spin_unlock(&inode->i_lock); spin_lock(&wb->list_lock); /* i_wb may have changed inbetween, can't use inode_to_wb() */ if (likely(wb == inode->i_wb)) { wb_put(wb); /* @inode already has ref */ return wb; } spin_unlock(&wb->list_lock); wb_put(wb); cpu_relax(); spin_lock(&inode->i_lock); } } /** * inode_to_wb_and_lock_list - determine an inode's wb and lock it * @inode: inode of interest * * Same as locked_inode_to_wb_and_lock_list() but @inode->i_lock isn't held * on entry. */ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode) __acquires(&wb->list_lock) { spin_lock(&inode->i_lock); return locked_inode_to_wb_and_lock_list(inode); } struct inode_switch_wbs_context { struct rcu_work work; /* * Multiple inodes can be switched at once. The switching procedure * consists of two parts, separated by a RCU grace period. To make * sure that the second part is executed for each inode gone through * the first part, all inode pointers are placed into a NULL-terminated * array embedded into struct inode_switch_wbs_context. Otherwise * an inode could be left in a non-consistent state. */ struct bdi_writeback *new_wb; struct inode *inodes[]; }; static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { down_write(&bdi->wb_switch_rwsem); } static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { up_write(&bdi->wb_switch_rwsem); } static bool inode_do_switch_wbs(struct inode *inode, struct bdi_writeback *old_wb, struct bdi_writeback *new_wb) { struct address_space *mapping = inode->i_mapping; XA_STATE(xas, &mapping->i_pages, 0); struct folio *folio; bool switched = false; spin_lock(&inode->i_lock); xa_lock_irq(&mapping->i_pages); /* * Once I_FREEING or I_WILL_FREE are visible under i_lock, the eviction * path owns the inode and we shouldn't modify ->i_io_list. */ if (unlikely(inode->i_state & (I_FREEING | I_WILL_FREE))) goto skip_switch; trace_inode_switch_wbs(inode, old_wb, new_wb); /* * Count and transfer stats. Note that PAGECACHE_TAG_DIRTY points * to possibly dirty folios while PAGECACHE_TAG_WRITEBACK points to * folios actually under writeback. */ xas_for_each_marked(&xas, folio, ULONG_MAX, PAGECACHE_TAG_DIRTY) { if (folio_test_dirty(folio)) { long nr = folio_nr_pages(folio); wb_stat_mod(old_wb, WB_RECLAIMABLE, -nr); wb_stat_mod(new_wb, WB_RECLAIMABLE, nr); } } xas_set(&xas, 0); xas_for_each_marked(&xas, folio, ULONG_MAX, PAGECACHE_TAG_WRITEBACK) { long nr = folio_nr_pages(folio); WARN_ON_ONCE(!folio_test_writeback(folio)); wb_stat_mod(old_wb, WB_WRITEBACK, -nr); wb_stat_mod(new_wb, WB_WRITEBACK, nr); } if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) { atomic_dec(&old_wb->writeback_inodes); atomic_inc(&new_wb->writeback_inodes); } wb_get(new_wb); /* * Transfer to @new_wb's IO list if necessary. If the @inode is dirty, * the specific list @inode was on is ignored and the @inode is put on * ->b_dirty which is always correct including from ->b_dirty_time. * The transfer preserves @inode->dirtied_when ordering. If the @inode * was clean, it means it was on the b_attached list, so move it onto * the b_attached list of @new_wb. */ if (!list_empty(&inode->i_io_list)) { inode->i_wb = new_wb; if (inode->i_state & I_DIRTY_ALL) { struct inode *pos; list_for_each_entry(pos, &new_wb->b_dirty, i_io_list) if (time_after_eq(inode->dirtied_when, pos->dirtied_when)) break; inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev); } else { inode_cgwb_move_to_attached(inode, new_wb); } } else { inode->i_wb = new_wb; } /* ->i_wb_frn updates may race wbc_detach_inode() but doesn't matter */ inode->i_wb_frn_winner = 0; inode->i_wb_frn_avg_time = 0; inode->i_wb_frn_history = 0; switched = true; skip_switch: /* * Paired with load_acquire in unlocked_inode_to_wb_begin() and * ensures that the new wb is visible if they see !I_WB_SWITCH. */ smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH); xa_unlock_irq(&mapping->i_pages); spin_unlock(&inode->i_lock); return switched; } static void inode_switch_wbs_work_fn(struct work_struct *work) { struct inode_switch_wbs_context *isw = container_of(to_rcu_work(work), struct inode_switch_wbs_context, work); struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]); struct bdi_writeback *old_wb = isw->inodes[0]->i_wb; struct bdi_writeback *new_wb = isw->new_wb; unsigned long nr_switched = 0; struct inode **inodep; /* * If @inode switches cgwb membership while sync_inodes_sb() is * being issued, sync_inodes_sb() might miss it. Synchronize. */ down_read(&bdi->wb_switch_rwsem); /* * By the time control reaches here, RCU grace period has passed * since I_WB_SWITCH assertion and all wb stat update transactions * between unlocked_inode_to_wb_begin/end() are guaranteed to be * synchronizing against the i_pages lock. * * Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock * gives us exclusion against all wb related operations on @inode * including IO list manipulations and stat updates. */ if (old_wb < new_wb) { spin_lock(&old_wb->list_lock); spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING); } else { spin_lock(&new_wb->list_lock); spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING); } for (inodep = isw->inodes; *inodep; inodep++) { WARN_ON_ONCE((*inodep)->i_wb != old_wb); if (inode_do_switch_wbs(*inodep, old_wb, new_wb)) nr_switched++; } spin_unlock(&new_wb->list_lock); spin_unlock(&old_wb->list_lock); up_read(&bdi->wb_switch_rwsem); if (nr_switched) { wb_wakeup(new_wb); wb_put_many(old_wb, nr_switched); } for (inodep = isw->inodes; *inodep; inodep++) iput(*inodep); wb_put(new_wb); kfree(isw); atomic_dec(&isw_nr_in_flight); } static bool inode_prepare_wbs_switch(struct inode *inode, struct bdi_writeback *new_wb) { /* * Paired with smp_mb() in cgroup_writeback_umount(). * isw_nr_in_flight must be increased before checking SB_ACTIVE and * grabbing an inode, otherwise isw_nr_in_flight can be observed as 0 * in cgroup_writeback_umount() and the isw_wq will be not flushed. */ smp_mb(); if (IS_DAX(inode)) return false; /* while holding I_WB_SWITCH, no one else can update the association */ spin_lock(&inode->i_lock); if (!(inode->i_sb->s_flags & SB_ACTIVE) || inode->i_state & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) || inode_to_wb(inode) == new_wb) { spin_unlock(&inode->i_lock); return false; } inode->i_state |= I_WB_SWITCH; __iget(inode); spin_unlock(&inode->i_lock); return true; } /** * inode_switch_wbs - change the wb association of an inode * @inode: target inode * @new_wb_id: ID of the new wb * * Switch @inode's wb association to the wb identified by @new_wb_id. The * switching is performed asynchronously and may fail silently. */ static void inode_switch_wbs(struct inode *inode, int new_wb_id) { struct backing_dev_info *bdi = inode_to_bdi(inode); struct cgroup_subsys_state *memcg_css; struct inode_switch_wbs_context *isw; /* noop if seems to be already in progress */ if (inode->i_state & I_WB_SWITCH) return; /* avoid queueing a new switch if too many are already in flight */ if (atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT) return; isw = kzalloc(struct_size(isw, inodes, 2), GFP_ATOMIC); if (!isw) return; atomic_inc(&isw_nr_in_flight); /* find and pin the new wb */ rcu_read_lock(); memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys); if (memcg_css && !css_tryget(memcg_css)) memcg_css = NULL; rcu_read_unlock(); if (!memcg_css) goto out_free; isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); css_put(memcg_css); if (!isw->new_wb) goto out_free; if (!inode_prepare_wbs_switch(inode, isw->new_wb)) goto out_free; isw->inodes[0] = inode; /* * In addition to synchronizing among switchers, I_WB_SWITCH tells * the RCU protected stat update paths to grab the i_page * lock so that stat transfer can synchronize against them. * Let's continue after I_WB_SWITCH is guaranteed to be visible. */ INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn); queue_rcu_work(isw_wq, &isw->work); return; out_free: atomic_dec(&isw_nr_in_flight); if (isw->new_wb) wb_put(isw->new_wb); kfree(isw); } static bool isw_prepare_wbs_switch(struct inode_switch_wbs_context *isw, struct list_head *list, int *nr) { struct inode *inode; list_for_each_entry(inode, list, i_io_list) { if (!inode_prepare_wbs_switch(inode, isw->new_wb)) continue; isw->inodes[*nr] = inode; (*nr)++; if (*nr >= WB_MAX_INODES_PER_ISW - 1) return true; } return false; } /** * cleanup_offline_cgwb - detach associated inodes * @wb: target wb * * Switch all inodes attached to @wb to a nearest living ancestor's wb in order * to eventually release the dying @wb. Returns %true if not all inodes were * switched and the function has to be restarted. */ bool cleanup_offline_cgwb(struct bdi_writeback *wb) { struct cgroup_subsys_state *memcg_css; struct inode_switch_wbs_context *isw; int nr; bool restart = false; isw = kzalloc(struct_size(isw, inodes, WB_MAX_INODES_PER_ISW), GFP_KERNEL); if (!isw) return restart; atomic_inc(&isw_nr_in_flight); for (memcg_css = wb->memcg_css->parent; memcg_css; memcg_css = memcg_css->parent) { isw->new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL); if (isw->new_wb) break; } if (unlikely(!isw->new_wb)) isw->new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */ nr = 0; spin_lock(&wb->list_lock); /* * In addition to the inodes that have completed writeback, also switch * cgwbs for those inodes only with dirty timestamps. Otherwise, those * inodes won't be written back for a long time when lazytime is * enabled, and thus pinning the dying cgwbs. It won't break the * bandwidth restrictions, as writeback of inode metadata is not * accounted for. */ restart = isw_prepare_wbs_switch(isw, &wb->b_attached, &nr); if (!restart) restart = isw_prepare_wbs_switch(isw, &wb->b_dirty_time, &nr); spin_unlock(&wb->list_lock); /* no attached inodes? bail out */ if (nr == 0) { atomic_dec(&isw_nr_in_flight); wb_put(isw->new_wb); kfree(isw); return restart; } /* * In addition to synchronizing among switchers, I_WB_SWITCH tells * the RCU protected stat update paths to grab the i_page * lock so that stat transfer can synchronize against them. * Let's continue after I_WB_SWITCH is guaranteed to be visible. */ INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn); queue_rcu_work(isw_wq, &isw->work); return restart; } /** * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it * @wbc: writeback_control of interest * @inode: target inode * * @inode is locked and about to be written back under the control of @wbc. * Record @inode's writeback context into @wbc and unlock the i_lock. On * writeback completion, wbc_detach_inode() should be called. This is used * to track the cgroup writeback context. */ void wbc_attach_and_unlock_inode(struct writeback_control *wbc, struct inode *inode) { if (!inode_cgwb_enabled(inode)) { spin_unlock(&inode->i_lock); return; } wbc->wb = inode_to_wb(inode); wbc->inode = inode; wbc->wb_id = wbc->wb->memcg_css->id; wbc->wb_lcand_id = inode->i_wb_frn_winner; wbc->wb_tcand_id = 0; wbc->wb_bytes = 0; wbc->wb_lcand_bytes = 0; wbc->wb_tcand_bytes = 0; wb_get(wbc->wb); spin_unlock(&inode->i_lock); /* * A dying wb indicates that either the blkcg associated with the * memcg changed or the associated memcg is dying. In the first * case, a replacement wb should already be available and we should * refresh the wb immediately. In the second case, trying to * refresh will keep failing. */ if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css))) inode_switch_wbs(inode, wbc->wb_id); } EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode); /** * wbc_detach_inode - disassociate wbc from inode and perform foreign detection * @wbc: writeback_control of the just finished writeback * * To be called after a writeback attempt of an inode finishes and undoes * wbc_attach_and_unlock_inode(). Can be called under any context. * * As concurrent write sharing of an inode is expected to be very rare and * memcg only tracks page ownership on first-use basis severely confining * the usefulness of such sharing, cgroup writeback tracks ownership * per-inode. While the support for concurrent write sharing of an inode * is deemed unnecessary, an inode being written to by different cgroups at * different points in time is a lot more common, and, more importantly, * charging only by first-use can too readily lead to grossly incorrect * behaviors (single foreign page can lead to gigabytes of writeback to be * incorrectly attributed). * * To resolve this issue, cgroup writeback detects the majority dirtier of * an inode and transfers the ownership to it. To avoid unnecessary * oscillation, the detection mechanism keeps track of history and gives * out the switch verdict only if the foreign usage pattern is stable over * a certain amount of time and/or writeback attempts. * * On each writeback attempt, @wbc tries to detect the majority writer * using Boyer-Moore majority vote algorithm. In addition to the byte * count from the majority voting, it also counts the bytes written for the * current wb and the last round's winner wb (max of last round's current * wb, the winner from two rounds ago, and the last round's majority * candidate). Keeping track of the historical winner helps the algorithm * to semi-reliably detect the most active writer even when it's not the * absolute majority. * * Once the winner of the round is determined, whether the winner is * foreign or not and how much IO time the round consumed is recorded in * inode->i_wb_frn_history. If the amount of recorded foreign IO time is * over a certain threshold, the switch verdict is given. */ void wbc_detach_inode(struct writeback_control *wbc) { struct bdi_writeback *wb = wbc->wb; struct inode *inode = wbc->inode; unsigned long avg_time, max_bytes, max_time; u16 history; int max_id; if (!wb) return; history = inode->i_wb_frn_history; avg_time = inode->i_wb_frn_avg_time; /* pick the winner of this round */ if (wbc->wb_bytes >= wbc->wb_lcand_bytes && wbc->wb_bytes >= wbc->wb_tcand_bytes) { max_id = wbc->wb_id; max_bytes = wbc->wb_bytes; } else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) { max_id = wbc->wb_lcand_id; max_bytes = wbc->wb_lcand_bytes; } else { max_id = wbc->wb_tcand_id; max_bytes = wbc->wb_tcand_bytes; } /* * Calculate the amount of IO time the winner consumed and fold it * into the running average kept per inode. If the consumed IO * time is lower than avag / WB_FRN_TIME_CUT_DIV, ignore it for * deciding whether to switch or not. This is to prevent one-off * small dirtiers from skewing the verdict. */ max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT, wb->avg_write_bandwidth); if (avg_time) avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) - (avg_time >> WB_FRN_TIME_AVG_SHIFT); else avg_time = max_time; /* immediate catch up on first run */ if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) { int slots; /* * The switch verdict is reached if foreign wb's consume * more than a certain proportion of IO time in a * WB_FRN_TIME_PERIOD. This is loosely tracked by 16 slot * history mask where each bit represents one sixteenth of * the period. Determine the number of slots to shift into * history from @max_time. */ slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT), (unsigned long)WB_FRN_HIST_MAX_SLOTS); history <<= slots; if (wbc->wb_id != max_id) history |= (1U << slots) - 1; if (history) trace_inode_foreign_history(inode, wbc, history); /* * Switch if the current wb isn't the consistent winner. * If there are multiple closely competing dirtiers, the * inode may switch across them repeatedly over time, which * is okay. The main goal is avoiding keeping an inode on * the wrong wb for an extended period of time. */ if (hweight16(history) > WB_FRN_HIST_THR_SLOTS) inode_switch_wbs(inode, max_id); } /* * Multiple instances of this function may race to update the * following fields but we don't mind occassional inaccuracies. */ inode->i_wb_frn_winner = max_id; inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX); inode->i_wb_frn_history = history; wb_put(wbc->wb); wbc->wb = NULL; } EXPORT_SYMBOL_GPL(wbc_detach_inode); /** * wbc_account_cgroup_owner - account writeback to update inode cgroup ownership * @wbc: writeback_control of the writeback in progress * @page: page being written out * @bytes: number of bytes being written out * * @bytes from @page are about to written out during the writeback * controlled by @wbc. Keep the book for foreign inode detection. See * wbc_detach_inode(). */ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, size_t bytes) { struct folio *folio; struct cgroup_subsys_state *css; int id; /* * pageout() path doesn't attach @wbc to the inode being written * out. This is intentional as we don't want the function to block * behind a slow cgroup. Ultimately, we want pageout() to kick off * regular writeback instead of writing things out itself. */ if (!wbc->wb || wbc->no_cgroup_owner) return; folio = page_folio(page); css = mem_cgroup_css_from_folio(folio); /* dead cgroups shouldn't contribute to inode ownership arbitration */ if (!(css->flags & CSS_ONLINE)) return; id = css->id; if (id == wbc->wb_id) { wbc->wb_bytes += bytes; return; } if (id == wbc->wb_lcand_id) wbc->wb_lcand_bytes += bytes; /* Boyer-Moore majority vote algorithm */ if (!wbc->wb_tcand_bytes) wbc->wb_tcand_id = id; if (id == wbc->wb_tcand_id) wbc->wb_tcand_bytes += bytes; else wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes); } EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner); /** * wb_split_bdi_pages - split nr_pages to write according to bandwidth * @wb: target bdi_writeback to split @nr_pages to * @nr_pages: number of pages to write for the whole bdi * * Split @wb's portion of @nr_pages according to @wb's write bandwidth in * relation to the total write bandwidth of all wb's w/ dirty inodes on * @wb->bdi. */ static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages) { unsigned long this_bw = wb->avg_write_bandwidth; unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth); if (nr_pages == LONG_MAX) return LONG_MAX; /* * This may be called on clean wb's and proportional distribution * may not make sense, just use the original @nr_pages in those * cases. In general, we wanna err on the side of writing more. */ if (!tot_bw || this_bw >= tot_bw) return nr_pages; else return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw); } /** * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi * @bdi: target backing_dev_info * @base_work: wb_writeback_work to issue * @skip_if_busy: skip wb's which already have writeback in progress * * Split and issue @base_work to all wb's (bdi_writeback's) of @bdi which * have dirty inodes. If @base_work->nr_page isn't %LONG_MAX, it's * distributed to the busy wbs according to each wb's proportion in the * total active write bandwidth of @bdi. */ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, struct wb_writeback_work *base_work, bool skip_if_busy) { struct bdi_writeback *last_wb = NULL; struct bdi_writeback *wb = list_entry(&bdi->wb_list, struct bdi_writeback, bdi_node); might_sleep(); restart: rcu_read_lock(); list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) { DEFINE_WB_COMPLETION(fallback_work_done, bdi); struct wb_writeback_work fallback_work; struct wb_writeback_work *work; long nr_pages; if (last_wb) { wb_put(last_wb); last_wb = NULL; } /* SYNC_ALL writes out I_DIRTY_TIME too */ if (!wb_has_dirty_io(wb) && (base_work->sync_mode == WB_SYNC_NONE || list_empty(&wb->b_dirty_time))) continue; if (skip_if_busy && writeback_in_progress(wb)) continue; nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages); work = kmalloc(sizeof(*work), GFP_ATOMIC); if (work) { *work = *base_work; work->nr_pages = nr_pages; work->auto_free = 1; wb_queue_work(wb, work); continue; } /* * If wb_tryget fails, the wb has been shutdown, skip it. * * Pin @wb so that it stays on @bdi->wb_list. This allows * continuing iteration from @wb after dropping and * regrabbing rcu read lock. */ if (!wb_tryget(wb)) continue; /* alloc failed, execute synchronously using on-stack fallback */ work = &fallback_work; *work = *base_work; work->nr_pages = nr_pages; work->auto_free = 0; work->done = &fallback_work_done; wb_queue_work(wb, work); last_wb = wb; rcu_read_unlock(); wb_wait_for_completion(&fallback_work_done); goto restart; } rcu_read_unlock(); if (last_wb) wb_put(last_wb); } /** * cgroup_writeback_by_id - initiate cgroup writeback from bdi and memcg IDs * @bdi_id: target bdi id * @memcg_id: target memcg css id * @reason: reason why some writeback work initiated * @done: target wb_completion * * Initiate flush of the bdi_writeback identified by @bdi_id and @memcg_id * with the specified parameters. */ int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, enum wb_reason reason, struct wb_completion *done) { struct backing_dev_info *bdi; struct cgroup_subsys_state *memcg_css; struct bdi_writeback *wb; struct wb_writeback_work *work; unsigned long dirty; int ret; /* lookup bdi and memcg */ bdi = bdi_get_by_id(bdi_id); if (!bdi) return -ENOENT; rcu_read_lock(); memcg_css = css_from_id(memcg_id, &memory_cgrp_subsys); if (memcg_css && !css_tryget(memcg_css)) memcg_css = NULL; rcu_read_unlock(); if (!memcg_css) { ret = -ENOENT; goto out_bdi_put; } /* * And find the associated wb. If the wb isn't there already * there's nothing to flush, don't create one. */ wb = wb_get_lookup(bdi, memcg_css); if (!wb) { ret = -ENOENT; goto out_css_put; } /* * The caller is attempting to write out most of * the currently dirty pages. Let's take the current dirty page * count and inflate it by 25% which should be large enough to * flush out most dirty pages while avoiding getting livelocked by * concurrent dirtiers. * * BTW the memcg stats are flushed periodically and this is best-effort * estimation, so some potential error is ok. */ dirty = memcg_page_state(mem_cgroup_from_css(memcg_css), NR_FILE_DIRTY); dirty = dirty * 10 / 8; /* issue the writeback work */ work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN); if (work) { work->nr_pages = dirty; work->sync_mode = WB_SYNC_NONE; work->range_cyclic = 1; work->reason = reason; work->done = done; work->auto_free = 1; wb_queue_work(wb, work); ret = 0; } else { ret = -ENOMEM; } wb_put(wb); out_css_put: css_put(memcg_css); out_bdi_put: bdi_put(bdi); return ret; } /** * cgroup_writeback_umount - flush inode wb switches for umount * * This function is called when a super_block is about to be destroyed and * flushes in-flight inode wb switches. An inode wb switch goes through * RCU and then workqueue, so the two need to be flushed in order to ensure * that all previously scheduled switches are finished. As wb switches are * rare occurrences and synchronize_rcu() can take a while, perform * flushing iff wb switches are in flight. */ void cgroup_writeback_umount(void) { /* * SB_ACTIVE should be reliably cleared before checking * isw_nr_in_flight, see generic_shutdown_super(). */ smp_mb(); if (atomic_read(&isw_nr_in_flight)) { /* * Use rcu_barrier() to wait for all pending callbacks to * ensure that all in-flight wb switches are in the workqueue. */ rcu_barrier(); flush_workqueue(isw_wq); } } static int __init cgroup_writeback_init(void) { isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0); if (!isw_wq) return -ENOMEM; return 0; } fs_initcall(cgroup_writeback_init); #else /* CONFIG_CGROUP_WRITEBACK */ static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { } static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { } static void inode_cgwb_move_to_attached(struct inode *inode, struct bdi_writeback *wb) { assert_spin_locked(&wb->list_lock); assert_spin_locked(&inode->i_lock); WARN_ON_ONCE(inode->i_state & I_FREEING); inode->i_state &= ~I_SYNC_QUEUED; list_del_init(&inode->i_io_list); wb_io_lists_depopulated(wb); } static struct bdi_writeback * locked_inode_to_wb_and_lock_list(struct inode *inode) __releases(&inode->i_lock) __acquires(&wb->list_lock) { struct bdi_writeback *wb = inode_to_wb(inode); spin_unlock(&inode->i_lock); spin_lock(&wb->list_lock); return wb; } static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode) __acquires(&wb->list_lock) { struct bdi_writeback *wb = inode_to_wb(inode); spin_lock(&wb->list_lock); return wb; } static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages) { return nr_pages; } static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, struct wb_writeback_work *base_work, bool skip_if_busy) { might_sleep(); if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) { base_work->auto_free = 0; wb_queue_work(&bdi->wb, base_work); } } #endif /* CONFIG_CGROUP_WRITEBACK */ /* * Add in the number of potentially dirty inodes, because each inode * write can dirty pagecache in the underlying blockdev. */ static unsigned long get_nr_dirty_pages(void) { return global_node_page_state(NR_FILE_DIRTY) + get_nr_dirty_inodes(); } static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason) { if (!wb_has_dirty_io(wb)) return; /* * All callers of this function want to start writeback of all * dirty pages. Places like vmscan can call this at a very * high frequency, causing pointless allocations of tons of * work items and keeping the flusher threads busy retrieving * that work. Ensure that we only allow one of them pending and * inflight at the time. */ if (test_bit(WB_start_all, &wb->state) || test_and_set_bit(WB_start_all, &wb->state)) return; wb->start_all_reason = reason; wb_wakeup(wb); } /** * wb_start_background_writeback - start background writeback * @wb: bdi_writback to write from * * Description: * This makes sure WB_SYNC_NONE background writeback happens. When * this function returns, it is only guaranteed that for given wb * some IO is happening if we are over background dirty threshold. * Caller need not hold sb s_umount semaphore. */ void wb_start_background_writeback(struct bdi_writeback *wb) { /* * We just wake up the flusher thread. It will perform background * writeback as soon as there is no other work to do. */ trace_writeback_wake_background(wb); wb_wakeup(wb); } /* * Remove the inode from the writeback list it is on. */ void inode_io_list_del(struct inode *inode) { struct bdi_writeback *wb; wb = inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); inode->i_state &= ~I_SYNC_QUEUED; list_del_init(&inode->i_io_list); wb_io_lists_depopulated(wb); spin_unlock(&inode->i_lock); spin_unlock(&wb->list_lock); } EXPORT_SYMBOL(inode_io_list_del); /* * mark an inode as under writeback on the sb */ void sb_mark_inode_writeback(struct inode *inode) { struct super_block *sb = inode->i_sb; unsigned long flags; if (list_empty(&inode->i_wb_list)) { spin_lock_irqsave(&sb->s_inode_wblist_lock, flags); if (list_empty(&inode->i_wb_list)) { list_add_tail(&inode->i_wb_list, &sb->s_inodes_wb); trace_sb_mark_inode_writeback(inode); } spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags); } } /* * clear an inode as under writeback on the sb */ void sb_clear_inode_writeback(struct inode *inode) { struct super_block *sb = inode->i_sb; unsigned long flags; if (!list_empty(&inode->i_wb_list)) { spin_lock_irqsave(&sb->s_inode_wblist_lock, flags); if (!list_empty(&inode->i_wb_list)) { list_del_init(&inode->i_wb_list); trace_sb_clear_inode_writeback(inode); } spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags); } } /* * Redirty an inode: set its when-it-was dirtied timestamp and move it to the * furthest end of its superblock's dirty-inode list. * * Before stamping the inode's ->dirtied_when, we check to see whether it is * already the most-recently-dirtied inode on the b_dirty list. If that is * the case then the inode must have been redirtied while it was being written * out and we don't reset its dirtied_when. */ static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb) { assert_spin_locked(&inode->i_lock); inode->i_state &= ~I_SYNC_QUEUED; /* * When the inode is being freed just don't bother with dirty list * tracking. Flush worker will ignore this inode anyway and it will * trigger assertions in inode_io_list_move_locked(). */ if (inode->i_state & I_FREEING) { list_del_init(&inode->i_io_list); wb_io_lists_depopulated(wb); return; } if (!list_empty(&wb->b_dirty)) { struct inode *tail; tail = wb_inode(wb->b_dirty.next); if (time_before(inode->dirtied_when, tail->dirtied_when)) inode->dirtied_when = jiffies; } inode_io_list_move_locked(inode, wb, &wb->b_dirty); } static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) { spin_lock(&inode->i_lock); redirty_tail_locked(inode, wb); spin_unlock(&inode->i_lock); } /* * requeue inode for re-scanning after bdi->b_io list is exhausted. */ static void requeue_io(struct inode *inode, struct bdi_writeback *wb) { inode_io_list_move_locked(inode, wb, &wb->b_more_io); } static void inode_sync_complete(struct inode *inode) { inode->i_state &= ~I_SYNC; /* If inode is clean an unused, put it into LRU now... */ inode_add_lru(inode); /* Waiters must see I_SYNC cleared before being woken up */ smp_mb(); wake_up_bit(&inode->i_state, __I_SYNC); } static bool inode_dirtied_after(struct inode *inode, unsigned long t) { bool ret = time_after(inode->dirtied_when, t); #ifndef CONFIG_64BIT /* * For inodes being constantly redirtied, dirtied_when can get stuck. * It _appears_ to be in the future, but is actually in distant past. * This test is necessary to prevent such wrapped-around relative times * from permanently stopping the whole bdi writeback. */ ret = ret && time_before_eq(inode->dirtied_when, jiffies); #endif return ret; } /* * Move expired (dirtied before dirtied_before) dirty inodes from * @delaying_queue to @dispatch_queue. */ static int move_expired_inodes(struct list_head *delaying_queue, struct list_head *dispatch_queue, unsigned long dirtied_before) { LIST_HEAD(tmp); struct list_head *pos, *node; struct super_block *sb = NULL; struct inode *inode; int do_sb_sort = 0; int moved = 0; while (!list_empty(delaying_queue)) { inode = wb_inode(delaying_queue->prev); if (inode_dirtied_after(inode, dirtied_before)) break; spin_lock(&inode->i_lock); list_move(&inode->i_io_list, &tmp); moved++; inode->i_state |= I_SYNC_QUEUED; spin_unlock(&inode->i_lock); if (sb_is_blkdev_sb(inode->i_sb)) continue; if (sb && sb != inode->i_sb) do_sb_sort = 1; sb = inode->i_sb; } /* just one sb in list, splice to dispatch_queue and we're done */ if (!do_sb_sort) { list_splice(&tmp, dispatch_queue); goto out; } /* * Although inode's i_io_list is moved from 'tmp' to 'dispatch_queue', * we don't take inode->i_lock here because it is just a pointless overhead. * Inode is already marked as I_SYNC_QUEUED so writeback list handling is * fully under our control. */ while (!list_empty(&tmp)) { sb = wb_inode(tmp.prev)->i_sb; list_for_each_prev_safe(pos, node, &tmp) { inode = wb_inode(pos); if (inode->i_sb == sb) list_move(&inode->i_io_list, dispatch_queue); } } out: return moved; } /* * Queue all expired dirty inodes for io, eldest first. * Before * newly dirtied b_dirty b_io b_more_io * =============> gf edc BA * After * newly dirtied b_dirty b_io b_more_io * =============> g fBAedc * | * +--> dequeue for IO */ static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work, unsigned long dirtied_before) { int moved; unsigned long time_expire_jif = dirtied_before; assert_spin_locked(&wb->list_lock); list_splice_init(&wb->b_more_io, &wb->b_io); moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, dirtied_before); if (!work->for_sync) time_expire_jif = jiffies - dirtytime_expire_interval * HZ; moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io, time_expire_jif); if (moved) wb_io_lists_populated(wb); trace_writeback_queue_io(wb, work, dirtied_before, moved); } static int write_inode(struct inode *inode, struct writeback_control *wbc) { int ret; if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) { trace_writeback_write_inode_start(inode, wbc); ret = inode->i_sb->s_op->write_inode(inode, wbc); trace_writeback_write_inode(inode, wbc); return ret; } return 0; } /* * Wait for writeback on an inode to complete. Called with i_lock held. * Caller must make sure inode cannot go away when we drop i_lock. */ static void __inode_wait_for_writeback(struct inode *inode) __releases(inode->i_lock) __acquires(inode->i_lock) { DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); wait_queue_head_t *wqh; wqh = bit_waitqueue(&inode->i_state, __I_SYNC); while (inode->i_state & I_SYNC) { spin_unlock(&inode->i_lock); __wait_on_bit(wqh, &wq, bit_wait, TASK_UNINTERRUPTIBLE); spin_lock(&inode->i_lock); } } /* * Wait for writeback on an inode to complete. Caller must have inode pinned. */ void inode_wait_for_writeback(struct inode *inode) { spin_lock(&inode->i_lock); __inode_wait_for_writeback(inode); spin_unlock(&inode->i_lock); } /* * Sleep until I_SYNC is cleared. This function must be called with i_lock * held and drops it. It is aimed for callers not holding any inode reference * so once i_lock is dropped, inode can go away. */ static void inode_sleep_on_writeback(struct inode *inode) __releases(inode->i_lock) { DEFINE_WAIT(wait); wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC); int sleep; prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); sleep = inode->i_state & I_SYNC; spin_unlock(&inode->i_lock); if (sleep) schedule(); finish_wait(wqh, &wait); } /* * Find proper writeback list for the inode depending on its current state and * possibly also change of its state while we were doing writeback. Here we * handle things such as livelock prevention or fairness of writeback among * inodes. This function can be called only by flusher thread - noone else * processes all inodes in writeback lists and requeueing inodes behind flusher * thread's back can have unexpected consequences. */ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, struct writeback_control *wbc, unsigned long dirtied_before) { if (inode->i_state & I_FREEING) return; /* * Sync livelock prevention. Each inode is tagged and synced in one * shot. If still dirty, it will be redirty_tail()'ed below. Update * the dirty time to prevent enqueue and sync it again. */ if ((inode->i_state & I_DIRTY) && (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)) inode->dirtied_when = jiffies; if (wbc->pages_skipped) { /* * Writeback is not making progress due to locked buffers. * Skip this inode for now. Although having skipped pages * is odd for clean inodes, it can happen for some * filesystems so handle that gracefully. */ if (inode->i_state & I_DIRTY_ALL) redirty_tail_locked(inode, wb); else inode_cgwb_move_to_attached(inode, wb); return; } if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { /* * We didn't write back all the pages. nfs_writepages() * sometimes bales out without doing anything. */ if (wbc->nr_to_write <= 0 && !inode_dirtied_after(inode, dirtied_before)) { /* Slice used up. Queue for next turn. */ requeue_io(inode, wb); } else { /* * Writeback blocked by something other than * congestion. Delay the inode for some time to * avoid spinning on the CPU (100% iowait) * retrying writeback of the dirty page/inode * that cannot be performed immediately. */ redirty_tail_locked(inode, wb); } } else if (inode->i_state & I_DIRTY) { /* * Filesystems can dirty the inode during writeback operations, * such as delayed allocation during submission or metadata * updates after data IO completion. */ redirty_tail_locked(inode, wb); } else if (inode->i_state & I_DIRTY_TIME) { inode->dirtied_when = jiffies; inode_io_list_move_locked(inode, wb, &wb->b_dirty_time); inode->i_state &= ~I_SYNC_QUEUED; } else { /* The inode is clean. Remove from writeback lists. */ inode_cgwb_move_to_attached(inode, wb); } } /* * Write out an inode and its dirty pages (or some of its dirty pages, depending * on @wbc->nr_to_write), and clear the relevant dirty flags from i_state. * * This doesn't remove the inode from the writeback list it is on, except * potentially to move it from b_dirty_time to b_dirty due to timestamp * expiration. The caller is otherwise responsible for writeback list handling. * * The caller is also responsible for setting the I_SYNC flag beforehand and * calling inode_sync_complete() to clear it afterwards. */ static int __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) { struct address_space *mapping = inode->i_mapping; long nr_to_write = wbc->nr_to_write; unsigned dirty; int ret; WARN_ON(!(inode->i_state & I_SYNC)); trace_writeback_single_inode_start(inode, wbc, nr_to_write); ret = do_writepages(mapping, wbc); /* * Make sure to wait on the data before writing out the metadata. * This is important for filesystems that modify metadata on data * I/O completion. We don't do it for sync(2) writeback because it has a * separate, external IO completion path and ->sync_fs for guaranteeing * inode metadata is written back correctly. */ if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) { int err = filemap_fdatawait(mapping); if (ret == 0) ret = err; } /* * If the inode has dirty timestamps and we need to write them, call * mark_inode_dirty_sync() to notify the filesystem about it and to * change I_DIRTY_TIME into I_DIRTY_SYNC. */ if ((inode->i_state & I_DIRTY_TIME) && (wbc->sync_mode == WB_SYNC_ALL || time_after(jiffies, inode->dirtied_time_when + dirtytime_expire_interval * HZ))) { trace_writeback_lazytime(inode); mark_inode_dirty_sync(inode); } /* * Get and clear the dirty flags from i_state. This needs to be done * after calling writepages because some filesystems may redirty the * inode during writepages due to delalloc. It also needs to be done * after handling timestamp expiration, as that may dirty the inode too. */ spin_lock(&inode->i_lock); dirty = inode->i_state & I_DIRTY; inode->i_state &= ~dirty; /* * Paired with smp_mb() in __mark_inode_dirty(). This allows * __mark_inode_dirty() to test i_state without grabbing i_lock - * either they see the I_DIRTY bits cleared or we see the dirtied * inode. * * I_DIRTY_PAGES is always cleared together above even if @mapping * still has dirty pages. The flag is reinstated after smp_mb() if * necessary. This guarantees that either __mark_inode_dirty() * sees clear I_DIRTY_PAGES or we see PAGECACHE_TAG_DIRTY. */ smp_mb(); if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) inode->i_state |= I_DIRTY_PAGES; else if (unlikely(inode->i_state & I_PINNING_NETFS_WB)) { if (!(inode->i_state & I_DIRTY_PAGES)) { inode->i_state &= ~I_PINNING_NETFS_WB; wbc->unpinned_netfs_wb = true; dirty |= I_PINNING_NETFS_WB; /* Cause write_inode */ } } spin_unlock(&inode->i_lock); /* Don't write the inode if only I_DIRTY_PAGES was set */ if (dirty & ~I_DIRTY_PAGES) { int err = write_inode(inode, wbc); if (ret == 0) ret = err; } wbc->unpinned_netfs_wb = false; trace_writeback_single_inode(inode, wbc, nr_to_write); return ret; } /* * Write out an inode's dirty data and metadata on-demand, i.e. separately from * the regular batched writeback done by the flusher threads in * writeback_sb_inodes(). @wbc controls various aspects of the write, such as * whether it is a data-integrity sync (%WB_SYNC_ALL) or not (%WB_SYNC_NONE). * * To prevent the inode from going away, either the caller must have a reference * to the inode, or the inode must have I_WILL_FREE or I_FREEING set. */ static int writeback_single_inode(struct inode *inode, struct writeback_control *wbc) { struct bdi_writeback *wb; int ret = 0; spin_lock(&inode->i_lock); if (!atomic_read(&inode->i_count)) WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); else WARN_ON(inode->i_state & I_WILL_FREE); if (inode->i_state & I_SYNC) { /* * Writeback is already running on the inode. For WB_SYNC_NONE, * that's enough and we can just return. For WB_SYNC_ALL, we * must wait for the existing writeback to complete, then do * writeback again if there's anything left. */ if (wbc->sync_mode != WB_SYNC_ALL) goto out; __inode_wait_for_writeback(inode); } WARN_ON(inode->i_state & I_SYNC); /* * If the inode is already fully clean, then there's nothing to do. * * For data-integrity syncs we also need to check whether any pages are * still under writeback, e.g. due to prior WB_SYNC_NONE writeback. If * there are any such pages, we'll need to wait for them. */ if (!(inode->i_state & I_DIRTY_ALL) && (wbc->sync_mode != WB_SYNC_ALL || !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))) goto out; inode->i_state |= I_SYNC; wbc_attach_and_unlock_inode(wbc, inode); ret = __writeback_single_inode(inode, wbc); wbc_detach_inode(wbc); wb = inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); /* * If the inode is freeing, its i_io_list shoudn't be updated * as it can be finally deleted at this moment. */ if (!(inode->i_state & I_FREEING)) { /* * If the inode is now fully clean, then it can be safely * removed from its writeback list (if any). Otherwise the * flusher threads are responsible for the writeback lists. */ if (!(inode->i_state & I_DIRTY_ALL)) inode_cgwb_move_to_attached(inode, wb); else if (!(inode->i_state & I_SYNC_QUEUED)) { if ((inode->i_state & I_DIRTY)) redirty_tail_locked(inode, wb); else if (inode->i_state & I_DIRTY_TIME) { inode->dirtied_when = jiffies; inode_io_list_move_locked(inode, wb, &wb->b_dirty_time); } } } spin_unlock(&wb->list_lock); inode_sync_complete(inode); out: spin_unlock(&inode->i_lock); return ret; } static long writeback_chunk_size(struct bdi_writeback *wb, struct wb_writeback_work *work) { long pages; /* * WB_SYNC_ALL mode does livelock avoidance by syncing dirty * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX * here avoids calling into writeback_inodes_wb() more than once. * * The intended call sequence for WB_SYNC_ALL writeback is: * * wb_writeback() * writeback_sb_inodes() <== called only once * write_cache_pages() <== called once for each inode * (quickly) tag currently dirty pages * (maybe slowly) sync all tagged pages */ if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages) pages = LONG_MAX; else { pages = min(wb->avg_write_bandwidth / 2, global_wb_domain.dirty_limit / DIRTY_SCOPE); pages = min(pages, work->nr_pages); pages = round_down(pages + MIN_WRITEBACK_PAGES, MIN_WRITEBACK_PAGES); } return pages; } /* * Write a portion of b_io inodes which belong to @sb. * * Return the number of pages and/or inodes written. * * NOTE! This is called with wb->list_lock held, and will * unlock and relock that for each inode it ends up doing * IO for. */ static long writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, struct wb_writeback_work *work) { struct writeback_control wbc = { .sync_mode = work->sync_mode, .tagged_writepages = work->tagged_writepages, .for_kupdate = work->for_kupdate, .for_background = work->for_background, .for_sync = work->for_sync, .range_cyclic = work->range_cyclic, .range_start = 0, .range_end = LLONG_MAX, }; unsigned long start_time = jiffies; long write_chunk; long total_wrote = 0; /* count both pages and inodes */ unsigned long dirtied_before = jiffies; if (work->for_kupdate) dirtied_before = jiffies - msecs_to_jiffies(dirty_expire_interval * 10); while (!list_empty(&wb->b_io)) { struct inode *inode = wb_inode(wb->b_io.prev); struct bdi_writeback *tmp_wb; long wrote; if (inode->i_sb != sb) { if (work->sb) { /* * We only want to write back data for this * superblock, move all inodes not belonging * to it back onto the dirty list. */ redirty_tail(inode, wb); continue; } /* * The inode belongs to a different superblock. * Bounce back to the caller to unpin this and * pin the next superblock. */ break; } /* * Don't bother with new inodes or inodes being freed, first * kind does not need periodic writeout yet, and for the latter * kind writeout is handled by the freer. */ spin_lock(&inode->i_lock); if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { redirty_tail_locked(inode, wb); spin_unlock(&inode->i_lock); continue; } if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) { /* * If this inode is locked for writeback and we are not * doing writeback-for-data-integrity, move it to * b_more_io so that writeback can proceed with the * other inodes on s_io. * * We'll have another go at writing back this inode * when we completed a full scan of b_io. */ requeue_io(inode, wb); spin_unlock(&inode->i_lock); trace_writeback_sb_inodes_requeue(inode); continue; } spin_unlock(&wb->list_lock); /* * We already requeued the inode if it had I_SYNC set and we * are doing WB_SYNC_NONE writeback. So this catches only the * WB_SYNC_ALL case. */ if (inode->i_state & I_SYNC) { /* Wait for I_SYNC. This function drops i_lock... */ inode_sleep_on_writeback(inode); /* Inode may be gone, start again */ spin_lock(&wb->list_lock); continue; } inode->i_state |= I_SYNC; wbc_attach_and_unlock_inode(&wbc, inode); write_chunk = writeback_chunk_size(wb, work); wbc.nr_to_write = write_chunk; wbc.pages_skipped = 0; /* * We use I_SYNC to pin the inode in memory. While it is set * evict_inode() will wait so the inode cannot be freed. */ __writeback_single_inode(inode, &wbc); wbc_detach_inode(&wbc); work->nr_pages -= write_chunk - wbc.nr_to_write; wrote = write_chunk - wbc.nr_to_write - wbc.pages_skipped; wrote = wrote < 0 ? 0 : wrote; total_wrote += wrote; if (need_resched()) { /* * We're trying to balance between building up a nice * long list of IOs to improve our merge rate, and * getting those IOs out quickly for anyone throttling * in balance_dirty_pages(). cond_resched() doesn't * unplug, so get our IOs out the door before we * give up the CPU. */ blk_flush_plug(current->plug, false); cond_resched(); } /* * Requeue @inode if still dirty. Be careful as @inode may * have been switched to another wb in the meantime. */ tmp_wb = inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); if (!(inode->i_state & I_DIRTY_ALL)) total_wrote++; requeue_inode(inode, tmp_wb, &wbc, dirtied_before); inode_sync_complete(inode); spin_unlock(&inode->i_lock); if (unlikely(tmp_wb != wb)) { spin_unlock(&tmp_wb->list_lock); spin_lock(&wb->list_lock); } /* * bail out to wb_writeback() often enough to check * background threshold and other termination conditions. */ if (total_wrote) { if (time_is_before_jiffies(start_time + HZ / 10UL)) break; if (work->nr_pages <= 0) break; } } return total_wrote; } static long __writeback_inodes_wb(struct bdi_writeback *wb, struct wb_writeback_work *work) { unsigned long start_time = jiffies; long wrote = 0; while (!list_empty(&wb->b_io)) { struct inode *inode = wb_inode(wb->b_io.prev); struct super_block *sb = inode->i_sb; if (!super_trylock_shared(sb)) { /* * super_trylock_shared() may fail consistently due to * s_umount being grabbed by someone else. Don't use * requeue_io() to avoid busy retrying the inode/sb. */ redirty_tail(inode, wb); continue; } wrote += writeback_sb_inodes(sb, wb, work); up_read(&sb->s_umount); /* refer to the same tests at the end of writeback_sb_inodes */ if (wrote) { if (time_is_before_jiffies(start_time + HZ / 10UL)) break; if (work->nr_pages <= 0) break; } } /* Leave any unwritten inodes on b_io */ return wrote; } static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, enum wb_reason reason) { struct wb_writeback_work work = { .nr_pages = nr_pages, .sync_mode = WB_SYNC_NONE, .range_cyclic = 1, .reason = reason, }; struct blk_plug plug; blk_start_plug(&plug); spin_lock(&wb->list_lock); if (list_empty(&wb->b_io)) queue_io(wb, &work, jiffies); __writeback_inodes_wb(wb, &work); spin_unlock(&wb->list_lock); blk_finish_plug(&plug); return nr_pages - work.nr_pages; } /* * Explicit flushing or periodic writeback of "old" data. * * Define "old": the first time one of an inode's pages is dirtied, we mark the * dirtying-time in the inode's address_space. So this periodic writeback code * just walks the superblock inode list, writing back any inodes which are * older than a specific point in time. * * Try to run once per dirty_writeback_interval. But if a writeback event * takes longer than a dirty_writeback_interval interval, then leave a * one-second gap. * * dirtied_before takes precedence over nr_to_write. So we'll only write back * all dirty pages if they are all attached to "old" mappings. */ static long wb_writeback(struct bdi_writeback *wb, struct wb_writeback_work *work) { long nr_pages = work->nr_pages; unsigned long dirtied_before = jiffies; struct inode *inode; long progress; struct blk_plug plug; bool queued = false; blk_start_plug(&plug); for (;;) { /* * Stop writeback when nr_pages has been consumed */ if (work->nr_pages <= 0) break; /* * Background writeout and kupdate-style writeback may * run forever. Stop them if there is other work to do * so that e.g. sync can proceed. They'll be restarted * after the other works are all done. */ if ((work->for_background || work->for_kupdate) && !list_empty(&wb->work_list)) break; /* * For background writeout, stop when we are below the * background dirty threshold */ if (work->for_background && !wb_over_bg_thresh(wb)) break; spin_lock(&wb->list_lock); trace_writeback_start(wb, work); if (list_empty(&wb->b_io)) { /* * Kupdate and background works are special and we want * to include all inodes that need writing. Livelock * avoidance is handled by these works yielding to any * other work so we are safe. */ if (work->for_kupdate) { dirtied_before = jiffies - msecs_to_jiffies(dirty_expire_interval * 10); } else if (work->for_background) dirtied_before = jiffies; queue_io(wb, work, dirtied_before); queued = true; } if (work->sb) progress = writeback_sb_inodes(work->sb, wb, work); else progress = __writeback_inodes_wb(wb, work); trace_writeback_written(wb, work); /* * Did we write something? Try for more * * Dirty inodes are moved to b_io for writeback in batches. * The completion of the current batch does not necessarily * mean the overall work is done. So we keep looping as long * as made some progress on cleaning pages or inodes. */ if (progress || !queued) { spin_unlock(&wb->list_lock); continue; } /* * No more inodes for IO, bail */ if (list_empty(&wb->b_more_io)) { spin_unlock(&wb->list_lock); break; } /* * Nothing written. Wait for some inode to * become available for writeback. Otherwise * we'll just busyloop. */ trace_writeback_wait(wb, work); inode = wb_inode(wb->b_more_io.prev); spin_lock(&inode->i_lock); spin_unlock(&wb->list_lock); /* This function drops i_lock... */ inode_sleep_on_writeback(inode); } blk_finish_plug(&plug); return nr_pages - work->nr_pages; } /* * Return the next wb_writeback_work struct that hasn't been processed yet. */ static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb) { struct wb_writeback_work *work = NULL; spin_lock_irq(&wb->work_lock); if (!list_empty(&wb->work_list)) { work = list_entry(wb->work_list.next, struct wb_writeback_work, list); list_del_init(&work->list); } spin_unlock_irq(&wb->work_lock); return work; } static long wb_check_background_flush(struct bdi_writeback *wb) { if (wb_over_bg_thresh(wb)) { struct wb_writeback_work work = { .nr_pages = LONG_MAX, .sync_mode = WB_SYNC_NONE, .for_background = 1, .range_cyclic = 1, .reason = WB_REASON_BACKGROUND, }; return wb_writeback(wb, &work); } return 0; } static long wb_check_old_data_flush(struct bdi_writeback *wb) { unsigned long expired; long nr_pages; /* * When set to zero, disable periodic writeback */ if (!dirty_writeback_interval) return 0; expired = wb->last_old_flush + msecs_to_jiffies(dirty_writeback_interval * 10); if (time_before(jiffies, expired)) return 0; wb->last_old_flush = jiffies; nr_pages = get_nr_dirty_pages(); if (nr_pages) { struct wb_writeback_work work = { .nr_pages = nr_pages, .sync_mode = WB_SYNC_NONE, .for_kupdate = 1, .range_cyclic = 1, .reason = WB_REASON_PERIODIC, }; return wb_writeback(wb, &work); } return 0; } static long wb_check_start_all(struct bdi_writeback *wb) { long nr_pages; if (!test_bit(WB_start_all, &wb->state)) return 0; nr_pages = get_nr_dirty_pages(); if (nr_pages) { struct wb_writeback_work work = { .nr_pages = wb_split_bdi_pages(wb, nr_pages), .sync_mode = WB_SYNC_NONE, .range_cyclic = 1, .reason = wb->start_all_reason, }; nr_pages = wb_writeback(wb, &work); } clear_bit(WB_start_all, &wb->state); return nr_pages; } /* * Retrieve work items and do the writeback they describe */ static long wb_do_writeback(struct bdi_writeback *wb) { struct wb_writeback_work *work; long wrote = 0; set_bit(WB_writeback_running, &wb->state); while ((work = get_next_work_item(wb)) != NULL) { trace_writeback_exec(wb, work); wrote += wb_writeback(wb, work); finish_writeback_work(work); } /* * Check for a flush-everything request */ wrote += wb_check_start_all(wb); /* * Check for periodic writeback, kupdated() style */ wrote += wb_check_old_data_flush(wb); wrote += wb_check_background_flush(wb); clear_bit(WB_writeback_running, &wb->state); return wrote; } /* * Handle writeback of dirty data for the device backed by this bdi. Also * reschedules periodically and does kupdated style flushing. */ void wb_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(to_delayed_work(work), struct bdi_writeback, dwork); long pages_written; set_worker_desc("flush-%s", bdi_dev_name(wb->bdi)); if (likely(!current_is_workqueue_rescuer() || !test_bit(WB_registered, &wb->state))) { /* * The normal path. Keep writing back @wb until its * work_list is empty. Note that this path is also taken * if @wb is shutting down even when we're running off the * rescuer as work_list needs to be drained. */ do { pages_written = wb_do_writeback(wb); trace_writeback_pages_written(pages_written); } while (!list_empty(&wb->work_list)); } else { /* * bdi_wq can't get enough workers and we're running off * the emergency worker. Don't hog it. Hopefully, 1024 is * enough for efficient IO. */ pages_written = writeback_inodes_wb(wb, 1024, WB_REASON_FORKER_THREAD); trace_writeback_pages_written(pages_written); } if (!list_empty(&wb->work_list)) wb_wakeup(wb); else if (wb_has_dirty_io(wb) && dirty_writeback_interval) wb_wakeup_delayed(wb); } /* * Start writeback of all dirty pages on this bdi. */ static void __wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, enum wb_reason reason) { struct bdi_writeback *wb; if (!bdi_has_dirty_io(bdi)) return; list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) wb_start_writeback(wb, reason); } void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, enum wb_reason reason) { rcu_read_lock(); __wakeup_flusher_threads_bdi(bdi, reason); rcu_read_unlock(); } /* * Wakeup the flusher threads to start writeback of all currently dirty pages */ void wakeup_flusher_threads(enum wb_reason reason) { struct backing_dev_info *bdi; /* * If we are expecting writeback progress we must submit plugged IO. */ blk_flush_plug(current->plug, true); rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) __wakeup_flusher_threads_bdi(bdi, reason); rcu_read_unlock(); } /* * Wake up bdi's periodically to make sure dirtytime inodes gets * written back periodically. We deliberately do *not* check the * b_dirtytime list in wb_has_dirty_io(), since this would cause the * kernel to be constantly waking up once there are any dirtytime * inodes on the system. So instead we define a separate delayed work * function which gets called much more rarely. (By default, only * once every 12 hours.) * * If there is any other write activity going on in the file system, * this function won't be necessary. But if the only thing that has * happened on the file system is a dirtytime inode caused by an atime * update, we need this infrastructure below to make sure that inode * eventually gets pushed out to disk. */ static void wakeup_dirtytime_writeback(struct work_struct *w); static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback); static void wakeup_dirtytime_writeback(struct work_struct *w) { struct backing_dev_info *bdi; rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { struct bdi_writeback *wb; list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) if (!list_empty(&wb->b_dirty_time)) wb_wakeup(wb); } rcu_read_unlock(); schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ); } static int __init start_dirtytime_writeback(void) { schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ); return 0; } __initcall(start_dirtytime_writeback); int dirtytime_interval_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write) mod_delayed_work(system_wq, &dirtytime_work, 0); return ret; } /** * __mark_inode_dirty - internal function to mark an inode dirty * * @inode: inode to mark * @flags: what kind of dirty, e.g. I_DIRTY_SYNC. This can be a combination of * multiple I_DIRTY_* flags, except that I_DIRTY_TIME can't be combined * with I_DIRTY_PAGES. * * Mark an inode as dirty. We notify the filesystem, then update the inode's * dirty flags. Then, if needed we add the inode to the appropriate dirty list. * * Most callers should use mark_inode_dirty() or mark_inode_dirty_sync() * instead of calling this directly. * * CAREFUL! We only add the inode to the dirty list if it is hashed or if it * refers to a blockdev. Unhashed inodes will never be added to the dirty list * even if they are later hashed, as they will have been marked dirty already. * * In short, ensure you hash any inodes _before_ you start marking them dirty. * * Note that for blockdevs, inode->dirtied_when represents the dirtying time of * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of * the kernel-internal blockdev inode represents the dirtying time of the * blockdev's pages. This is why for I_DIRTY_PAGES we always use * page->mapping->host, so the page-dirtying time is recorded in the internal * blockdev inode. */ void __mark_inode_dirty(struct inode *inode, int flags) { struct super_block *sb = inode->i_sb; int dirtytime = 0; struct bdi_writeback *wb = NULL; trace_writeback_mark_inode_dirty(inode, flags); if (flags & I_DIRTY_INODE) { /* * Inode timestamp update will piggback on this dirtying. * We tell ->dirty_inode callback that timestamps need to * be updated by setting I_DIRTY_TIME in flags. */ if (inode->i_state & I_DIRTY_TIME) { spin_lock(&inode->i_lock); if (inode->i_state & I_DIRTY_TIME) { inode->i_state &= ~I_DIRTY_TIME; flags |= I_DIRTY_TIME; } spin_unlock(&inode->i_lock); } /* * Notify the filesystem about the inode being dirtied, so that * (if needed) it can update on-disk fields and journal the * inode. This is only needed when the inode itself is being * dirtied now. I.e. it's only needed for I_DIRTY_INODE, not * for just I_DIRTY_PAGES or I_DIRTY_TIME. */ trace_writeback_dirty_inode_start(inode, flags); if (sb->s_op->dirty_inode) sb->s_op->dirty_inode(inode, flags & (I_DIRTY_INODE | I_DIRTY_TIME)); trace_writeback_dirty_inode(inode, flags); /* I_DIRTY_INODE supersedes I_DIRTY_TIME. */ flags &= ~I_DIRTY_TIME; } else { /* * Else it's either I_DIRTY_PAGES, I_DIRTY_TIME, or nothing. * (We don't support setting both I_DIRTY_PAGES and I_DIRTY_TIME * in one call to __mark_inode_dirty().) */ dirtytime = flags & I_DIRTY_TIME; WARN_ON_ONCE(dirtytime && flags != I_DIRTY_TIME); } /* * Paired with smp_mb() in __writeback_single_inode() for the * following lockless i_state test. See there for details. */ smp_mb(); if ((inode->i_state & flags) == flags) return; spin_lock(&inode->i_lock); if ((inode->i_state & flags) != flags) { const int was_dirty = inode->i_state & I_DIRTY; inode_attach_wb(inode, NULL); inode->i_state |= flags; /* * Grab inode's wb early because it requires dropping i_lock and we * need to make sure following checks happen atomically with dirty * list handling so that we don't move inodes under flush worker's * hands. */ if (!was_dirty) { wb = locked_inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); } /* * If the inode is queued for writeback by flush worker, just * update its dirty state. Once the flush worker is done with * the inode it will place it on the appropriate superblock * list, based upon its state. */ if (inode->i_state & I_SYNC_QUEUED) goto out_unlock; /* * Only add valid (hashed) inodes to the superblock's * dirty list. Add blockdev inodes as well. */ if (!S_ISBLK(inode->i_mode)) { if (inode_unhashed(inode)) goto out_unlock; } if (inode->i_state & I_FREEING) goto out_unlock; /* * If the inode was already on b_dirty/b_io/b_more_io, don't * reposition it (that would break b_dirty time-ordering). */ if (!was_dirty) { struct list_head *dirty_list; bool wakeup_bdi = false; inode->dirtied_when = jiffies; if (dirtytime) inode->dirtied_time_when = jiffies; if (inode->i_state & I_DIRTY) dirty_list = &wb->b_dirty; else dirty_list = &wb->b_dirty_time; wakeup_bdi = inode_io_list_move_locked(inode, wb, dirty_list); spin_unlock(&wb->list_lock); spin_unlock(&inode->i_lock); trace_writeback_dirty_inode_enqueue(inode); /* * If this is the first dirty inode for this bdi, * we have to wake-up the corresponding bdi thread * to make sure background write-back happens * later. */ if (wakeup_bdi && (wb->bdi->capabilities & BDI_CAP_WRITEBACK)) wb_wakeup_delayed(wb); return; } } out_unlock: if (wb) spin_unlock(&wb->list_lock); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(__mark_inode_dirty); /* * The @s_sync_lock is used to serialise concurrent sync operations * to avoid lock contention problems with concurrent wait_sb_inodes() calls. * Concurrent callers will block on the s_sync_lock rather than doing contending * walks. The queueing maintains sync(2) required behaviour as all the IO that * has been issued up to the time this function is enter is guaranteed to be * completed by the time we have gained the lock and waited for all IO that is * in progress regardless of the order callers are granted the lock. */ static void wait_sb_inodes(struct super_block *sb) { LIST_HEAD(sync_list); /* * We need to be protected against the filesystem going from * r/o to r/w or vice versa. */ WARN_ON(!rwsem_is_locked(&sb->s_umount)); mutex_lock(&sb->s_sync_lock); /* * Splice the writeback list onto a temporary list to avoid waiting on * inodes that have started writeback after this point. * * Use rcu_read_lock() to keep the inodes around until we have a * reference. s_inode_wblist_lock protects sb->s_inodes_wb as well as * the local list because inodes can be dropped from either by writeback * completion. */ rcu_read_lock(); spin_lock_irq(&sb->s_inode_wblist_lock); list_splice_init(&sb->s_inodes_wb, &sync_list); /* * Data integrity sync. Must wait for all pages under writeback, because * there may have been pages dirtied before our sync call, but which had * writeout started before we write it out. In which case, the inode * may not be on the dirty list, but we still have to wait for that * writeout. */ while (!list_empty(&sync_list)) { struct inode *inode = list_first_entry(&sync_list, struct inode, i_wb_list); struct address_space *mapping = inode->i_mapping; /* * Move each inode back to the wb list before we drop the lock * to preserve consistency between i_wb_list and the mapping * writeback tag. Writeback completion is responsible to remove * the inode from either list once the writeback tag is cleared. */ list_move_tail(&inode->i_wb_list, &sb->s_inodes_wb); /* * The mapping can appear untagged while still on-list since we * do not have the mapping lock. Skip it here, wb completion * will remove it. */ if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) continue; spin_unlock_irq(&sb->s_inode_wblist_lock); spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) { spin_unlock(&inode->i_lock); spin_lock_irq(&sb->s_inode_wblist_lock); continue; } __iget(inode); spin_unlock(&inode->i_lock); rcu_read_unlock(); /* * We keep the error status of individual mapping so that * applications can catch the writeback error using fsync(2). * See filemap_fdatawait_keep_errors() for details. */ filemap_fdatawait_keep_errors(mapping); cond_resched(); iput(inode); rcu_read_lock(); spin_lock_irq(&sb->s_inode_wblist_lock); } spin_unlock_irq(&sb->s_inode_wblist_lock); rcu_read_unlock(); mutex_unlock(&sb->s_sync_lock); } static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, enum wb_reason reason, bool skip_if_busy) { struct backing_dev_info *bdi = sb->s_bdi; DEFINE_WB_COMPLETION(done, bdi); struct wb_writeback_work work = { .sb = sb, .sync_mode = WB_SYNC_NONE, .tagged_writepages = 1, .done = &done, .nr_pages = nr, .reason = reason, }; if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info) return; WARN_ON(!rwsem_is_locked(&sb->s_umount)); bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy); wb_wait_for_completion(&done); } /** * writeback_inodes_sb_nr - writeback dirty inodes from given super_block * @sb: the superblock * @nr: the number of pages to write * @reason: reason why some writeback work initiated * * Start writeback on some inodes on this super_block. No guarantees are made * on how many (if any) will be written, and this function does not wait * for IO completion of submitted IO. */ void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, enum wb_reason reason) { __writeback_inodes_sb_nr(sb, nr, reason, false); } EXPORT_SYMBOL(writeback_inodes_sb_nr); /** * writeback_inodes_sb - writeback dirty inodes from given super_block * @sb: the superblock * @reason: reason why some writeback work was initiated * * Start writeback on some inodes on this super_block. No guarantees are made * on how many (if any) will be written, and this function does not wait * for IO completion of submitted IO. */ void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) { writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason); } EXPORT_SYMBOL(writeback_inodes_sb); /** * try_to_writeback_inodes_sb - try to start writeback if none underway * @sb: the superblock * @reason: reason why some writeback work was initiated * * Invoke __writeback_inodes_sb_nr if no writeback is currently underway. */ void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) { if (!down_read_trylock(&sb->s_umount)) return; __writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason, true); up_read(&sb->s_umount); } EXPORT_SYMBOL(try_to_writeback_inodes_sb); /** * sync_inodes_sb - sync sb inode pages * @sb: the superblock * * This function writes and waits on any dirty inode belonging to this * super_block. */ void sync_inodes_sb(struct super_block *sb) { struct backing_dev_info *bdi = sb->s_bdi; DEFINE_WB_COMPLETION(done, bdi); struct wb_writeback_work work = { .sb = sb, .sync_mode = WB_SYNC_ALL, .nr_pages = LONG_MAX, .range_cyclic = 0, .done = &done, .reason = WB_REASON_SYNC, .for_sync = 1, }; /* * Can't skip on !bdi_has_dirty() because we should wait for !dirty * inodes under writeback and I_DIRTY_TIME inodes ignored by * bdi_has_dirty() need to be written out too. */ if (bdi == &noop_backing_dev_info) return; WARN_ON(!rwsem_is_locked(&sb->s_umount)); /* protect against inode wb switch, see inode_switch_wbs_work_fn() */ bdi_down_write_wb_switch_rwsem(bdi); bdi_split_work_to_wbs(bdi, &work, false); wb_wait_for_completion(&done); bdi_up_write_wb_switch_rwsem(bdi); wait_sb_inodes(sb); } EXPORT_SYMBOL(sync_inodes_sb); /** * write_inode_now - write an inode to disk * @inode: inode to write to disk * @sync: whether the write should be synchronous or not * * This function commits an inode to disk immediately if it is dirty. This is * primarily needed by knfsd. * * The caller must either have a ref on the inode or must have set I_WILL_FREE. */ int write_inode_now(struct inode *inode, int sync) { struct writeback_control wbc = { .nr_to_write = LONG_MAX, .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE, .range_start = 0, .range_end = LLONG_MAX, }; if (!mapping_can_writeback(inode->i_mapping)) wbc.nr_to_write = 0; might_sleep(); return writeback_single_inode(inode, &wbc); } EXPORT_SYMBOL(write_inode_now); /** * sync_inode_metadata - write an inode to disk * @inode: the inode to sync * @wait: wait for I/O to complete. * * Write an inode to disk and adjust its dirty state after completion. * * Note: only writes the actual inode, no associated data or other metadata. */ int sync_inode_metadata(struct inode *inode, int wait) { struct writeback_control wbc = { .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE, .nr_to_write = 0, /* metadata-only */ }; return writeback_single_inode(inode, &wbc); } EXPORT_SYMBOL(sync_inode_metadata);
3 10 10 18 18 18 18 18 17 18 15 8 4 7 1 1 1 1 1 5 7 10 7 4 10 10 3 4 4 10 4 6 10 10 4 6 1 1 4 4 4 4 17 17 5 17 17 15 16 15 16 13 13 17 12 6 12 9 12 12 11 11 7 10 10 10 10 10 10 10 10 10 2 8 2 10 10 4 2 12 25 25 15 4 9 9 5 4 9 9 9 9 1 1 1 1 5 8 5 5 5 3 8 3 3 2 3 3 3 3 3 2 2 2 2 2 2 7 7 7 7 7 7 6 3 4 4 2 2 4 4 4 1 1 3 1 2 4 4 7 11 4 3 3 3 2 3 2 3 1 3 3 2 2 4 5 5 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 1 1 1 2 7 6 6 6 7 6 3 6 3 3 6 6 6 6 3 4 4 4 1 1 1 4 6 6 6 3 3 3 6 6 3 6 1 1 1 1 1 1 1 1 2 2 2 2 1 2 1 1 1 1 1 1 1 2 1 2 2 2 1 1 1 1 1 1 1 1 1 1 1 2 1 2 4 3 3 4 9 9 7 2 9 2 2 1 1 1 1 1 3 1 2 2 2 3 2 2 1 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook */ #include <linux/bpf.h> #include <linux/btf.h> #include <linux/jhash.h> #include <linux/filter.h> #include <linux/rculist_nulls.h> #include <linux/rcupdate_wait.h> #include <linux/random.h> #include <uapi/linux/btf.h> #include <linux/rcupdate_trace.h> #include <linux/btf_ids.h> #include "percpu_freelist.h" #include "bpf_lru_list.h" #include "map_in_map.h" #include <linux/bpf_mem_alloc.h> #define HTAB_CREATE_FLAG_MASK \ (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED) #define BATCH_OPS(_name) \ .map_lookup_batch = \ _name##_map_lookup_batch, \ .map_lookup_and_delete_batch = \ _name##_map_lookup_and_delete_batch, \ .map_update_batch = \ generic_map_update_batch, \ .map_delete_batch = \ generic_map_delete_batch /* * The bucket lock has two protection scopes: * * 1) Serializing concurrent operations from BPF programs on different * CPUs * * 2) Serializing concurrent operations from BPF programs and sys_bpf() * * BPF programs can execute in any context including perf, kprobes and * tracing. As there are almost no limits where perf, kprobes and tracing * can be invoked from the lock operations need to be protected against * deadlocks. Deadlocks can be caused by recursion and by an invocation in * the lock held section when functions which acquire this lock are invoked * from sys_bpf(). BPF recursion is prevented by incrementing the per CPU * variable bpf_prog_active, which prevents BPF programs attached to perf * events, kprobes and tracing to be invoked before the prior invocation * from one of these contexts completed. sys_bpf() uses the same mechanism * by pinning the task to the current CPU and incrementing the recursion * protection across the map operation. * * This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain * operations like memory allocations (even with GFP_ATOMIC) from atomic * contexts. This is required because even with GFP_ATOMIC the memory * allocator calls into code paths which acquire locks with long held lock * sections. To ensure the deterministic behaviour these locks are regular * spinlocks, which are converted to 'sleepable' spinlocks on RT. The only * true atomic contexts on an RT kernel are the low level hardware * handling, scheduling, low level interrupt handling, NMIs etc. None of * these contexts should ever do memory allocations. * * As regular device interrupt handlers and soft interrupts are forced into * thread context, the existing code which does * spin_lock*(); alloc(GFP_ATOMIC); spin_unlock*(); * just works. * * In theory the BPF locks could be converted to regular spinlocks as well, * but the bucket locks and percpu_freelist locks can be taken from * arbitrary contexts (perf, kprobes, tracepoints) which are required to be * atomic contexts even on RT. Before the introduction of bpf_mem_alloc, * it is only safe to use raw spinlock for preallocated hash map on a RT kernel, * because there is no memory allocation within the lock held sections. However * after hash map was fully converted to use bpf_mem_alloc, there will be * non-synchronous memory allocation for non-preallocated hash map, so it is * safe to always use raw spinlock for bucket lock. */ struct bucket { struct hlist_nulls_head head; raw_spinlock_t raw_lock; }; #define HASHTAB_MAP_LOCK_COUNT 8 #define HASHTAB_MAP_LOCK_MASK (HASHTAB_MAP_LOCK_COUNT - 1) struct bpf_htab { struct bpf_map map; struct bpf_mem_alloc ma; struct bpf_mem_alloc pcpu_ma; struct bucket *buckets; void *elems; union { struct pcpu_freelist freelist; struct bpf_lru lru; }; struct htab_elem *__percpu *extra_elems; /* number of elements in non-preallocated hashtable are kept * in either pcount or count */ struct percpu_counter pcount; atomic_t count; bool use_percpu_counter; u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ u32 hashrnd; struct lock_class_key lockdep_key; int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT]; }; /* each htab element is struct htab_elem + key + value */ struct htab_elem { union { struct hlist_nulls_node hash_node; struct { void *padding; union { struct pcpu_freelist_node fnode; struct htab_elem *batch_flink; }; }; }; union { /* pointer to per-cpu pointer */ void *ptr_to_pptr; struct bpf_lru_node lru_node; }; u32 hash; char key[] __aligned(8); }; static inline bool htab_is_prealloc(const struct bpf_htab *htab) { return !(htab->map.map_flags & BPF_F_NO_PREALLOC); } static void htab_init_buckets(struct bpf_htab *htab) { unsigned int i; for (i = 0; i < htab->n_buckets; i++) { INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); raw_spin_lock_init(&htab->buckets[i].raw_lock); lockdep_set_class(&htab->buckets[i].raw_lock, &htab->lockdep_key); cond_resched(); } } static inline int htab_lock_bucket(const struct bpf_htab *htab, struct bucket *b, u32 hash, unsigned long *pflags) { unsigned long flags; hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1); preempt_disable(); local_irq_save(flags); if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) { __this_cpu_dec(*(htab->map_locked[hash])); local_irq_restore(flags); preempt_enable(); return -EBUSY; } raw_spin_lock(&b->raw_lock); *pflags = flags; return 0; } static inline void htab_unlock_bucket(const struct bpf_htab *htab, struct bucket *b, u32 hash, unsigned long flags) { hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1); raw_spin_unlock(&b->raw_lock); __this_cpu_dec(*(htab->map_locked[hash])); local_irq_restore(flags); preempt_enable(); } static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); static bool htab_is_lru(const struct bpf_htab *htab) { return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH || htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; } static bool htab_is_percpu(const struct bpf_htab *htab) { return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH || htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; } static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, void __percpu *pptr) { *(void __percpu **)(l->key + key_size) = pptr; } static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size) { return *(void __percpu **)(l->key + key_size); } static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l) { return *(void **)(l->key + roundup(map->key_size, 8)); } static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) { return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size); } static bool htab_has_extra_elems(struct bpf_htab *htab) { return !htab_is_percpu(htab) && !htab_is_lru(htab); } static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab) { u32 num_entries = htab->map.max_entries; int i; if (htab_has_extra_elems(htab)) num_entries += num_possible_cpus(); for (i = 0; i < num_entries; i++) { struct htab_elem *elem; elem = get_htab_elem(htab, i); if (btf_record_has_field(htab->map.record, BPF_TIMER)) bpf_obj_free_timer(htab->map.record, elem->key + round_up(htab->map.key_size, 8)); if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) bpf_obj_free_workqueue(htab->map.record, elem->key + round_up(htab->map.key_size, 8)); cond_resched(); } } static void htab_free_prealloced_fields(struct bpf_htab *htab) { u32 num_entries = htab->map.max_entries; int i; if (IS_ERR_OR_NULL(htab->map.record)) return; if (htab_has_extra_elems(htab)) num_entries += num_possible_cpus(); for (i = 0; i < num_entries; i++) { struct htab_elem *elem; elem = get_htab_elem(htab, i); if (htab_is_percpu(htab)) { void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size); int cpu; for_each_possible_cpu(cpu) { bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu)); cond_resched(); } } else { bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8)); cond_resched(); } cond_resched(); } } static void htab_free_elems(struct bpf_htab *htab) { int i; if (!htab_is_percpu(htab)) goto free_elems; for (i = 0; i < htab->map.max_entries; i++) { void __percpu *pptr; pptr = htab_elem_get_ptr(get_htab_elem(htab, i), htab->map.key_size); free_percpu(pptr); cond_resched(); } free_elems: bpf_map_area_free(htab->elems); } /* The LRU list has a lock (lru_lock). Each htab bucket has a lock * (bucket_lock). If both locks need to be acquired together, the lock * order is always lru_lock -> bucket_lock and this only happens in * bpf_lru_list.c logic. For example, certain code path of * bpf_lru_pop_free(), which is called by function prealloc_lru_pop(), * will acquire lru_lock first followed by acquiring bucket_lock. * * In hashtab.c, to avoid deadlock, lock acquisition of * bucket_lock followed by lru_lock is not allowed. In such cases, * bucket_lock needs to be released first before acquiring lru_lock. */ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key, u32 hash) { struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash); struct htab_elem *l; if (node) { bpf_map_inc_elem_count(&htab->map); l = container_of(node, struct htab_elem, lru_node); memcpy(l->key, key, htab->map.key_size); return l; } return NULL; } static int prealloc_init(struct bpf_htab *htab) { u32 num_entries = htab->map.max_entries; int err = -ENOMEM, i; if (htab_has_extra_elems(htab)) num_entries += num_possible_cpus(); htab->elems = bpf_map_area_alloc((u64)htab->elem_size * num_entries, htab->map.numa_node); if (!htab->elems) return -ENOMEM; if (!htab_is_percpu(htab)) goto skip_percpu_elems; for (i = 0; i < num_entries; i++) { u32 size = round_up(htab->map.value_size, 8); void __percpu *pptr; pptr = bpf_map_alloc_percpu(&htab->map, size, 8, GFP_USER | __GFP_NOWARN); if (!pptr) goto free_elems; htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size, pptr); cond_resched(); } skip_percpu_elems: if (htab_is_lru(htab)) err = bpf_lru_init(&htab->lru, htab->map.map_flags & BPF_F_NO_COMMON_LRU, offsetof(struct htab_elem, hash) - offsetof(struct htab_elem, lru_node), htab_lru_map_delete_node, htab); else err = pcpu_freelist_init(&htab->freelist); if (err) goto free_elems; if (htab_is_lru(htab)) bpf_lru_populate(&htab->lru, htab->elems, offsetof(struct htab_elem, lru_node), htab->elem_size, num_entries); else pcpu_freelist_populate(&htab->freelist, htab->elems + offsetof(struct htab_elem, fnode), htab->elem_size, num_entries); return 0; free_elems: htab_free_elems(htab); return err; } static void prealloc_destroy(struct bpf_htab *htab) { htab_free_elems(htab); if (htab_is_lru(htab)) bpf_lru_destroy(&htab->lru); else pcpu_freelist_destroy(&htab->freelist); } static int alloc_extra_elems(struct bpf_htab *htab) { struct htab_elem *__percpu *pptr, *l_new; struct pcpu_freelist_node *l; int cpu; pptr = bpf_map_alloc_percpu(&htab->map, sizeof(struct htab_elem *), 8, GFP_USER | __GFP_NOWARN); if (!pptr) return -ENOMEM; for_each_possible_cpu(cpu) { l = pcpu_freelist_pop(&htab->freelist); /* pop will succeed, since prealloc_init() * preallocated extra num_possible_cpus elements */ l_new = container_of(l, struct htab_elem, fnode); *per_cpu_ptr(pptr, cpu) = l_new; } htab->extra_elems = pptr; return 0; } /* Called from syscall */ static int htab_map_alloc_check(union bpf_attr *attr) { bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH || attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); /* percpu_lru means each cpu has its own LRU list. * it is different from BPF_MAP_TYPE_PERCPU_HASH where * the map's value itself is percpu. percpu_lru has * nothing to do with the map's value. */ bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED); int numa_node = bpf_map_attr_numa_node(attr); BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) != offsetof(struct htab_elem, hash_node.pprev)); if (zero_seed && !capable(CAP_SYS_ADMIN)) /* Guard against local DoS, and discourage production use. */ return -EPERM; if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK || !bpf_map_flags_access_ok(attr->map_flags)) return -EINVAL; if (!lru && percpu_lru) return -EINVAL; if (lru && !prealloc) return -ENOTSUPP; if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru)) return -EINVAL; /* check sanity of attributes. * value_size == 0 may be allowed in the future to use map as a set */ if (attr->max_entries == 0 || attr->key_size == 0 || attr->value_size == 0) return -EINVAL; if ((u64)attr->key_size + attr->value_size >= KMALLOC_MAX_SIZE - sizeof(struct htab_elem)) /* if key_size + value_size is bigger, the user space won't be * able to access the elements via bpf syscall. This check * also makes sure that the elem_size doesn't overflow and it's * kmalloc-able later in htab_map_update_elem() */ return -E2BIG; return 0; } static struct bpf_map *htab_map_alloc(union bpf_attr *attr) { bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH || attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); /* percpu_lru means each cpu has its own LRU list. * it is different from BPF_MAP_TYPE_PERCPU_HASH where * the map's value itself is percpu. percpu_lru has * nothing to do with the map's value. */ bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); struct bpf_htab *htab; int err, i; htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE); if (!htab) return ERR_PTR(-ENOMEM); lockdep_register_key(&htab->lockdep_key); bpf_map_init_from_attr(&htab->map, attr); if (percpu_lru) { /* ensure each CPU's lru list has >=1 elements. * since we are at it, make each lru list has the same * number of elements. */ htab->map.max_entries = roundup(attr->max_entries, num_possible_cpus()); if (htab->map.max_entries < attr->max_entries) htab->map.max_entries = rounddown(attr->max_entries, num_possible_cpus()); } /* hash table size must be power of 2; roundup_pow_of_two() can overflow * into UB on 32-bit arches, so check that first */ err = -E2BIG; if (htab->map.max_entries > 1UL << 31) goto free_htab; htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); htab->elem_size = sizeof(struct htab_elem) + round_up(htab->map.key_size, 8); if (percpu) htab->elem_size += sizeof(void *); else htab->elem_size += round_up(htab->map.value_size, 8); /* check for u32 overflow */ if (htab->n_buckets > U32_MAX / sizeof(struct bucket)) goto free_htab; err = bpf_map_init_elem_count(&htab->map); if (err) goto free_htab; err = -ENOMEM; htab->buckets = bpf_map_area_alloc(htab->n_buckets * sizeof(struct bucket), htab->map.numa_node); if (!htab->buckets) goto free_elem_count; for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) { htab->map_locked[i] = bpf_map_alloc_percpu(&htab->map, sizeof(int), sizeof(int), GFP_USER); if (!htab->map_locked[i]) goto free_map_locked; } if (htab->map.map_flags & BPF_F_ZERO_SEED) htab->hashrnd = 0; else htab->hashrnd = get_random_u32(); htab_init_buckets(htab); /* compute_batch_value() computes batch value as num_online_cpus() * 2 * and __percpu_counter_compare() needs * htab->max_entries - cur_number_of_elems to be more than batch * num_online_cpus() * for percpu_counter to be faster than atomic_t. In practice the average bpf * hash map size is 10k, which means that a system with 64 cpus will fill * hashmap to 20% of 10k before percpu_counter becomes ineffective. Therefore * define our own batch count as 32 then 10k hash map can be filled up to 80%: * 10k - 8k > 32 _batch_ * 64 _cpus_ * and __percpu_counter_compare() will still be fast. At that point hash map * collisions will dominate its performance anyway. Assume that hash map filled * to 50+% isn't going to be O(1) and use the following formula to choose * between percpu_counter and atomic_t. */ #define PERCPU_COUNTER_BATCH 32 if (attr->max_entries / 2 > num_online_cpus() * PERCPU_COUNTER_BATCH) htab->use_percpu_counter = true; if (htab->use_percpu_counter) { err = percpu_counter_init(&htab->pcount, 0, GFP_KERNEL); if (err) goto free_map_locked; } if (prealloc) { err = prealloc_init(htab); if (err) goto free_map_locked; if (!percpu && !lru) { /* lru itself can remove the least used element, so * there is no need for an extra elem during map_update. */ err = alloc_extra_elems(htab); if (err) goto free_prealloc; } } else { err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false); if (err) goto free_map_locked; if (percpu) { err = bpf_mem_alloc_init(&htab->pcpu_ma, round_up(htab->map.value_size, 8), true); if (err) goto free_map_locked; } } return &htab->map; free_prealloc: prealloc_destroy(htab); free_map_locked: if (htab->use_percpu_counter) percpu_counter_destroy(&htab->pcount); for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) free_percpu(htab->map_locked[i]); bpf_map_area_free(htab->buckets); bpf_mem_alloc_destroy(&htab->pcpu_ma); bpf_mem_alloc_destroy(&htab->ma); free_elem_count: bpf_map_free_elem_count(&htab->map); free_htab: lockdep_unregister_key(&htab->lockdep_key); bpf_map_area_free(htab); return ERR_PTR(err); } static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd) { if (likely(key_len % 4 == 0)) return jhash2(key, key_len / 4, hashrnd); return jhash(key, key_len, hashrnd); } static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) { return &htab->buckets[hash & (htab->n_buckets - 1)]; } static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash) { return &__select_bucket(htab, hash)->head; } /* this lookup function can only be called with bucket lock taken */ static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash, void *key, u32 key_size) { struct hlist_nulls_node *n; struct htab_elem *l; hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) if (l->hash == hash && !memcmp(&l->key, key, key_size)) return l; return NULL; } /* can be called without bucket lock. it will repeat the loop in * the unlikely event when elements moved from one bucket into another * while link list is being walked */ static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head, u32 hash, void *key, u32 key_size, u32 n_buckets) { struct hlist_nulls_node *n; struct htab_elem *l; again: hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) if (l->hash == hash && !memcmp(&l->key, key, key_size)) return l; if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1)))) goto again; return NULL; } /* Called from syscall or from eBPF program directly, so * arguments have to match bpf_map_lookup_elem() exactly. * The return value is adjusted by BPF instructions * in htab_map_gen_lookup(). */ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct hlist_nulls_head *head; struct htab_elem *l; u32 hash, key_size; WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && !rcu_read_lock_bh_held()); key_size = map->key_size; hash = htab_map_hash(key, key_size, htab->hashrnd); head = select_bucket(htab, hash); l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets); return l; } static void *htab_map_lookup_elem(struct bpf_map *map, void *key) { struct htab_elem *l = __htab_map_lookup_elem(map, key); if (l) return l->key + round_up(map->key_size, 8); return NULL; } /* inline bpf_map_lookup_elem() call. * Instead of: * bpf_prog * bpf_map_lookup_elem * map->ops->map_lookup_elem * htab_map_lookup_elem * __htab_map_lookup_elem * do: * bpf_prog * __htab_map_lookup_elem */ static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_insn *insn = insn_buf; const int ret = BPF_REG_0; BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, (void *(*)(struct bpf_map *map, void *key))NULL)); *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem); *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1); *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, offsetof(struct htab_elem, key) + round_up(map->key_size, 8)); return insn - insn_buf; } static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map, void *key, const bool mark) { struct htab_elem *l = __htab_map_lookup_elem(map, key); if (l) { if (mark) bpf_lru_node_set_ref(&l->lru_node); return l->key + round_up(map->key_size, 8); } return NULL; } static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) { return __htab_lru_map_lookup_elem(map, key, true); } static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key) { return __htab_lru_map_lookup_elem(map, key, false); } static int htab_lru_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_insn *insn = insn_buf; const int ret = BPF_REG_0; const int ref_reg = BPF_REG_1; BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, (void *(*)(struct bpf_map *map, void *key))NULL)); *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem); *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4); *insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret, offsetof(struct htab_elem, lru_node) + offsetof(struct bpf_lru_node, ref)); *insn++ = BPF_JMP_IMM(BPF_JNE, ref_reg, 0, 1); *insn++ = BPF_ST_MEM(BPF_B, ret, offsetof(struct htab_elem, lru_node) + offsetof(struct bpf_lru_node, ref), 1); *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, offsetof(struct htab_elem, key) + round_up(map->key_size, 8)); return insn - insn_buf; } static void check_and_free_fields(struct bpf_htab *htab, struct htab_elem *elem) { if (htab_is_percpu(htab)) { void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size); int cpu; for_each_possible_cpu(cpu) bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu)); } else { void *map_value = elem->key + round_up(htab->map.key_size, 8); bpf_obj_free_fields(htab->map.record, map_value); } } /* It is called from the bpf_lru_list when the LRU needs to delete * older elements from the htab. */ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) { struct bpf_htab *htab = arg; struct htab_elem *l = NULL, *tgt_l; struct hlist_nulls_head *head; struct hlist_nulls_node *n; unsigned long flags; struct bucket *b; int ret; tgt_l = container_of(node, struct htab_elem, lru_node); b = __select_bucket(htab, tgt_l->hash); head = &b->head; ret = htab_lock_bucket(htab, b, tgt_l->hash, &flags); if (ret) return false; hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) if (l == tgt_l) { hlist_nulls_del_rcu(&l->hash_node); check_and_free_fields(htab, l); bpf_map_dec_elem_count(&htab->map); break; } htab_unlock_bucket(htab, b, tgt_l->hash, flags); return l == tgt_l; } /* Called from syscall */ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct hlist_nulls_head *head; struct htab_elem *l, *next_l; u32 hash, key_size; int i = 0; WARN_ON_ONCE(!rcu_read_lock_held()); key_size = map->key_size; if (!key) goto find_first_elem; hash = htab_map_hash(key, key_size, htab->hashrnd); head = select_bucket(htab, hash); /* lookup the key */ l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets); if (!l) goto find_first_elem; /* key was found, get next key in the same bucket */ next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)), struct htab_elem, hash_node); if (next_l) { /* if next elem in this hash list is non-zero, just return it */ memcpy(next_key, next_l->key, key_size); return 0; } /* no more elements in this hash list, go to the next bucket */ i = hash & (htab->n_buckets - 1); i++; find_first_elem: /* iterate over buckets */ for (; i < htab->n_buckets; i++) { head = select_bucket(htab, i); /* pick first element in the bucket */ next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)), struct htab_elem, hash_node); if (next_l) { /* if it's not empty, just return it */ memcpy(next_key, next_l->key, key_size); return 0; } } /* iterated over all buckets and all elements */ return -ENOENT; } static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) { check_and_free_fields(htab, l); if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr); bpf_mem_cache_free(&htab->ma, l); } static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l) { struct bpf_map *map = &htab->map; void *ptr; if (map->ops->map_fd_put_ptr) { ptr = fd_htab_map_get_ptr(map, l); map->ops->map_fd_put_ptr(map, ptr, true); } } static bool is_map_full(struct bpf_htab *htab) { if (htab->use_percpu_counter) return __percpu_counter_compare(&htab->pcount, htab->map.max_entries, PERCPU_COUNTER_BATCH) >= 0; return atomic_read(&htab->count) >= htab->map.max_entries; } static void inc_elem_count(struct bpf_htab *htab) { bpf_map_inc_elem_count(&htab->map); if (htab->use_percpu_counter) percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH); else atomic_inc(&htab->count); } static void dec_elem_count(struct bpf_htab *htab) { bpf_map_dec_elem_count(&htab->map); if (htab->use_percpu_counter) percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH); else atomic_dec(&htab->count); } static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) { htab_put_fd_value(htab, l); if (htab_is_prealloc(htab)) { bpf_map_dec_elem_count(&htab->map); check_and_free_fields(htab, l); __pcpu_freelist_push(&htab->freelist, &l->fnode); } else { dec_elem_count(htab); htab_elem_free(htab, l); } } static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, void *value, bool onallcpus) { if (!onallcpus) { /* copy true value_size bytes */ copy_map_value(&htab->map, this_cpu_ptr(pptr), value); } else { u32 size = round_up(htab->map.value_size, 8); int off = 0, cpu; for_each_possible_cpu(cpu) { copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value + off); off += size; } } } static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr, void *value, bool onallcpus) { /* When not setting the initial value on all cpus, zero-fill element * values for other cpus. Otherwise, bpf program has no way to ensure * known initial values for cpus other than current one * (onallcpus=false always when coming from bpf prog). */ if (!onallcpus) { int current_cpu = raw_smp_processor_id(); int cpu; for_each_possible_cpu(cpu) { if (cpu == current_cpu) copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value); else /* Since elem is preallocated, we cannot touch special fields */ zero_map_value(&htab->map, per_cpu_ptr(pptr, cpu)); } } else { pcpu_copy_value(htab, pptr, value, onallcpus); } } static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) { return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS && BITS_PER_LONG == 64; } static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, void *value, u32 key_size, u32 hash, bool percpu, bool onallcpus, struct htab_elem *old_elem) { u32 size = htab->map.value_size; bool prealloc = htab_is_prealloc(htab); struct htab_elem *l_new, **pl_new; void __percpu *pptr; if (prealloc) { if (old_elem) { /* if we're updating the existing element, * use per-cpu extra elems to avoid freelist_pop/push */ pl_new = this_cpu_ptr(htab->extra_elems); l_new = *pl_new; htab_put_fd_value(htab, old_elem); *pl_new = old_elem; } else { struct pcpu_freelist_node *l; l = __pcpu_freelist_pop(&htab->freelist); if (!l) return ERR_PTR(-E2BIG); l_new = container_of(l, struct htab_elem, fnode); bpf_map_inc_elem_count(&htab->map); } } else { if (is_map_full(htab)) if (!old_elem) /* when map is full and update() is replacing * old element, it's ok to allocate, since * old element will be freed immediately. * Otherwise return an error */ return ERR_PTR(-E2BIG); inc_elem_count(htab); l_new = bpf_mem_cache_alloc(&htab->ma); if (!l_new) { l_new = ERR_PTR(-ENOMEM); goto dec_count; } } memcpy(l_new->key, key, key_size); if (percpu) { if (prealloc) { pptr = htab_elem_get_ptr(l_new, key_size); } else { /* alloc_percpu zero-fills */ pptr = bpf_mem_cache_alloc(&htab->pcpu_ma); if (!pptr) { bpf_mem_cache_free(&htab->ma, l_new); l_new = ERR_PTR(-ENOMEM); goto dec_count; } l_new->ptr_to_pptr = pptr; pptr = *(void **)pptr; } pcpu_init_value(htab, pptr, value, onallcpus); if (!prealloc) htab_elem_set_ptr(l_new, key_size, pptr); } else if (fd_htab_map_needs_adjust(htab)) { size = round_up(size, 8); memcpy(l_new->key + round_up(key_size, 8), value, size); } else { copy_map_value(&htab->map, l_new->key + round_up(key_size, 8), value); } l_new->hash = hash; return l_new; dec_count: dec_elem_count(htab); return l_new; } static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old, u64 map_flags) { if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST) /* elem already exists */ return -EEXIST; if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST) /* elem doesn't exist, cannot update it */ return -ENOENT; return 0; } /* Called from syscall or from eBPF program */ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new = NULL, *l_old; struct hlist_nulls_head *head; unsigned long flags; struct bucket *b; u32 key_size, hash; int ret; if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST)) /* unknown flags */ return -EINVAL; WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && !rcu_read_lock_bh_held()); key_size = map->key_size; hash = htab_map_hash(key, key_size, htab->hashrnd); b = __select_bucket(htab, hash); head = &b->head; if (unlikely(map_flags & BPF_F_LOCK)) { if (unlikely(!btf_record_has_field(map->record, BPF_SPIN_LOCK))) return -EINVAL; /* find an element without taking the bucket lock */ l_old = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets); ret = check_flags(htab, l_old, map_flags); if (ret) return ret; if (l_old) { /* grab the element lock and update value in place */ copy_map_value_locked(map, l_old->key + round_up(key_size, 8), value, false); return 0; } /* fall through, grab the bucket lock and lookup again. * 99.9% chance that the element won't be found, * but second lookup under lock has to be done. */ } ret = htab_lock_bucket(htab, b, hash, &flags); if (ret) return ret; l_old = lookup_elem_raw(head, hash, key, key_size); ret = check_flags(htab, l_old, map_flags); if (ret) goto err; if (unlikely(l_old && (map_flags & BPF_F_LOCK))) { /* first lookup without the bucket lock didn't find the element, * but second lookup with the bucket lock found it. * This case is highly unlikely, but has to be dealt with: * grab the element lock in addition to the bucket lock * and update element in place */ copy_map_value_locked(map, l_old->key + round_up(key_size, 8), value, false); ret = 0; goto err; } l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false, l_old); if (IS_ERR(l_new)) { /* all pre-allocated elements are in use or memory exhausted */ ret = PTR_ERR(l_new); goto err; } /* add new element to the head of the list, so that * concurrent search will find it before old elem */ hlist_nulls_add_head_rcu(&l_new->hash_node, head); if (l_old) { hlist_nulls_del_rcu(&l_old->hash_node); if (!htab_is_prealloc(htab)) free_htab_elem(htab, l_old); else check_and_free_fields(htab, l_old); } ret = 0; err: htab_unlock_bucket(htab, b, hash, flags); return ret; } static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem) { check_and_free_fields(htab, elem); bpf_map_dec_elem_count(&htab->map); bpf_lru_push_free(&htab->lru, &elem->lru_node); } static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new, *l_old = NULL; struct hlist_nulls_head *head; unsigned long flags; struct bucket *b; u32 key_size, hash; int ret; if (unlikely(map_flags > BPF_EXIST)) /* unknown flags */ return -EINVAL; WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && !rcu_read_lock_bh_held()); key_size = map->key_size; hash = htab_map_hash(key, key_size, htab->hashrnd); b = __select_bucket(htab, hash); head = &b->head; /* For LRU, we need to alloc before taking bucket's * spinlock because getting free nodes from LRU may need * to remove older elements from htab and this removal * operation will need a bucket lock. */ l_new = prealloc_lru_pop(htab, key, hash); if (!l_new) return -ENOMEM; copy_map_value(&htab->map, l_new->key + round_up(map->key_size, 8), value); ret = htab_lock_bucket(htab, b, hash, &flags); if (ret) goto err_lock_bucket; l_old = lookup_elem_raw(head, hash, key, key_size); ret = check_flags(htab, l_old, map_flags); if (ret) goto err; /* add new element to the head of the list, so that * concurrent search will find it before old elem */ hlist_nulls_add_head_rcu(&l_new->hash_node, head); if (l_old) { bpf_lru_node_set_ref(&l_new->lru_node); hlist_nulls_del_rcu(&l_old->hash_node); } ret = 0; err: htab_unlock_bucket(htab, b, hash, flags); err_lock_bucket: if (ret) htab_lru_push_free(htab, l_new); else if (l_old) htab_lru_push_free(htab, l_old); return ret; } static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags, bool onallcpus) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new = NULL, *l_old; struct hlist_nulls_head *head; unsigned long flags; struct bucket *b; u32 key_size, hash; int ret; if (unlikely(map_flags > BPF_EXIST)) /* unknown flags */ return -EINVAL; WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && !rcu_read_lock_bh_held()); key_size = map->key_size; hash = htab_map_hash(key, key_size, htab->hashrnd); b = __select_bucket(htab, hash); head = &b->head; ret = htab_lock_bucket(htab, b, hash, &flags); if (ret) return ret; l_old = lookup_elem_raw(head, hash, key, key_size); ret = check_flags(htab, l_old, map_flags); if (ret) goto err; if (l_old) { /* per-cpu hash map can update value in-place */ pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), value, onallcpus); } else { l_new = alloc_htab_elem(htab, key, value, key_size, hash, true, onallcpus, NULL); if (IS_ERR(l_new)) { ret = PTR_ERR(l_new); goto err; } hlist_nulls_add_head_rcu(&l_new->hash_node, head); } ret = 0; err: htab_unlock_bucket(htab, b, hash, flags); return ret; } static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags, bool onallcpus) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new = NULL, *l_old; struct hlist_nulls_head *head; unsigned long flags; struct bucket *b; u32 key_size, hash; int ret; if (unlikely(map_flags > BPF_EXIST)) /* unknown flags */ return -EINVAL; WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && !rcu_read_lock_bh_held()); key_size = map->key_size; hash = htab_map_hash(key, key_size, htab->hashrnd); b = __select_bucket(htab, hash); head = &b->head; /* For LRU, we need to alloc before taking bucket's * spinlock because LRU's elem alloc may need * to remove older elem from htab and this removal * operation will need a bucket lock. */ if (map_flags != BPF_EXIST) { l_new = prealloc_lru_pop(htab, key, hash); if (!l_new) return -ENOMEM; } ret = htab_lock_bucket(htab, b, hash, &flags); if (ret) goto err_lock_bucket; l_old = lookup_elem_raw(head, hash, key, key_size); ret = check_flags(htab, l_old, map_flags); if (ret) goto err; if (l_old) { bpf_lru_node_set_ref(&l_old->lru_node); /* per-cpu hash map can update value in-place */ pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), value, onallcpus); } else { pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size), value, onallcpus); hlist_nulls_add_head_rcu(&l_new->hash_node, head); l_new = NULL; } ret = 0; err: htab_unlock_bucket(htab, b, hash, flags); err_lock_bucket: if (l_new) { bpf_map_dec_elem_count(&htab->map); bpf_lru_push_free(&htab->lru, &l_new->lru_node); } return ret; } static long htab_percpu_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { return __htab_percpu_map_update_elem(map, key, value, map_flags, false); } static long htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { return __htab_lru_percpu_map_update_elem(map, key, value, map_flags, false); } /* Called from syscall or from eBPF program */ static long htab_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct hlist_nulls_head *head; struct bucket *b; struct htab_elem *l; unsigned long flags; u32 hash, key_size; int ret; WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && !rcu_read_lock_bh_held()); key_size = map->key_size; hash = htab_map_hash(key, key_size, htab->hashrnd); b = __select_bucket(htab, hash); head = &b->head; ret = htab_lock_bucket(htab, b, hash, &flags); if (ret) return ret; l = lookup_elem_raw(head, hash, key, key_size); if (l) { hlist_nulls_del_rcu(&l->hash_node); free_htab_elem(htab, l); } else { ret = -ENOENT; } htab_unlock_bucket(htab, b, hash, flags); return ret; } static long htab_lru_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct hlist_nulls_head *head; struct bucket *b; struct htab_elem *l; unsigned long flags; u32 hash, key_size; int ret; WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && !rcu_read_lock_bh_held()); key_size = map->key_size; hash = htab_map_hash(key, key_size, htab->hashrnd); b = __select_bucket(htab, hash); head = &b->head; ret = htab_lock_bucket(htab, b, hash, &flags); if (ret) return ret; l = lookup_elem_raw(head, hash, key, key_size); if (l) hlist_nulls_del_rcu(&l->hash_node); else ret = -ENOENT; htab_unlock_bucket(htab, b, hash, flags); if (l) htab_lru_push_free(htab, l); return ret; } static void delete_all_elements(struct bpf_htab *htab) { int i; /* It's called from a worker thread, so disable migration here, * since bpf_mem_cache_free() relies on that. */ migrate_disable(); for (i = 0; i < htab->n_buckets; i++) { struct hlist_nulls_head *head = select_bucket(htab, i); struct hlist_nulls_node *n; struct htab_elem *l; hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { hlist_nulls_del_rcu(&l->hash_node); htab_elem_free(htab, l); } cond_resched(); } migrate_enable(); } static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab) { int i; rcu_read_lock(); for (i = 0; i < htab->n_buckets; i++) { struct hlist_nulls_head *head = select_bucket(htab, i); struct hlist_nulls_node *n; struct htab_elem *l; hlist_nulls_for_each_entry(l, n, head, hash_node) { /* We only free timer on uref dropping to zero */ if (btf_record_has_field(htab->map.record, BPF_TIMER)) bpf_obj_free_timer(htab->map.record, l->key + round_up(htab->map.key_size, 8)); if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) bpf_obj_free_workqueue(htab->map.record, l->key + round_up(htab->map.key_size, 8)); } cond_resched_rcu(); } rcu_read_unlock(); } static void htab_map_free_timers_and_wq(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); /* We only free timer and workqueue on uref dropping to zero */ if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE)) { if (!htab_is_prealloc(htab)) htab_free_malloced_timers_and_wq(htab); else htab_free_prealloced_timers_and_wq(htab); } } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ static void htab_map_free(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); int i; /* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback. * bpf_free_used_maps() is called after bpf prog is no longer executing. * There is no need to synchronize_rcu() here to protect map elements. */ /* htab no longer uses call_rcu() directly. bpf_mem_alloc does it * underneath and is responsible for waiting for callbacks to finish * during bpf_mem_alloc_destroy(). */ if (!htab_is_prealloc(htab)) { delete_all_elements(htab); } else { htab_free_prealloced_fields(htab); prealloc_destroy(htab); } bpf_map_free_elem_count(map); free_percpu(htab->extra_elems); bpf_map_area_free(htab->buckets); bpf_mem_alloc_destroy(&htab->pcpu_ma); bpf_mem_alloc_destroy(&htab->ma); if (htab->use_percpu_counter) percpu_counter_destroy(&htab->pcount); for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) free_percpu(htab->map_locked[i]); lockdep_unregister_key(&htab->lockdep_key); bpf_map_area_free(htab); } static void htab_map_seq_show_elem(struct bpf_map *map, void *key, struct seq_file *m) { void *value; rcu_read_lock(); value = htab_map_lookup_elem(map, key); if (!value) { rcu_read_unlock(); return; } btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); seq_puts(m, ": "); btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); seq_puts(m, "\n"); rcu_read_unlock(); } static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, void *value, bool is_lru_map, bool is_percpu, u64 flags) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct hlist_nulls_head *head; unsigned long bflags; struct htab_elem *l; u32 hash, key_size; struct bucket *b; int ret; key_size = map->key_size; hash = htab_map_hash(key, key_size, htab->hashrnd); b = __select_bucket(htab, hash); head = &b->head; ret = htab_lock_bucket(htab, b, hash, &bflags); if (ret) return ret; l = lookup_elem_raw(head, hash, key, key_size); if (!l) { ret = -ENOENT; } else { if (is_percpu) { u32 roundup_value_size = round_up(map->value_size, 8); void __percpu *pptr; int off = 0, cpu; pptr = htab_elem_get_ptr(l, key_size); for_each_possible_cpu(cpu) { copy_map_value_long(&htab->map, value + off, per_cpu_ptr(pptr, cpu)); check_and_init_map_value(&htab->map, value + off); off += roundup_value_size; } } else { u32 roundup_key_size = round_up(map->key_size, 8); if (flags & BPF_F_LOCK) copy_map_value_locked(map, value, l->key + roundup_key_size, true); else copy_map_value(map, value, l->key + roundup_key_size); /* Zeroing special fields in the temp buffer */ check_and_init_map_value(map, value); } hlist_nulls_del_rcu(&l->hash_node); if (!is_lru_map) free_htab_elem(htab, l); } htab_unlock_bucket(htab, b, hash, bflags); if (is_lru_map && l) htab_lru_push_free(htab, l); return ret; } static int htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, void *value, u64 flags) { return __htab_map_lookup_and_delete_elem(map, key, value, false, false, flags); } static int htab_percpu_map_lookup_and_delete_elem(struct bpf_map *map, void *key, void *value, u64 flags) { return __htab_map_lookup_and_delete_elem(map, key, value, false, true, flags); } static int htab_lru_map_lookup_and_delete_elem(struct bpf_map *map, void *key, void *value, u64 flags) { return __htab_map_lookup_and_delete_elem(map, key, value, true, false, flags); } static int htab_lru_percpu_map_lookup_and_delete_elem(struct bpf_map *map, void *key, void *value, u64 flags) { return __htab_map_lookup_and_delete_elem(map, key, value, true, true, flags); } static int __htab_map_lookup_and_delete_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr, bool do_delete, bool is_lru_map, bool is_percpu) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); u32 bucket_cnt, total, key_size, value_size, roundup_key_size; void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val; void __user *uvalues = u64_to_user_ptr(attr->batch.values); void __user *ukeys = u64_to_user_ptr(attr->batch.keys); void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); u32 batch, max_count, size, bucket_size, map_id; struct htab_elem *node_to_free = NULL; u64 elem_map_flags, map_flags; struct hlist_nulls_head *head; struct hlist_nulls_node *n; unsigned long flags = 0; bool locked = false; struct htab_elem *l; struct bucket *b; int ret = 0; elem_map_flags = attr->batch.elem_flags; if ((elem_map_flags & ~BPF_F_LOCK) || ((elem_map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK))) return -EINVAL; map_flags = attr->batch.flags; if (map_flags) return -EINVAL; max_count = attr->batch.count; if (!max_count) return 0; if (put_user(0, &uattr->batch.count)) return -EFAULT; batch = 0; if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch))) return -EFAULT; if (batch >= htab->n_buckets) return -ENOENT; key_size = htab->map.key_size; roundup_key_size = round_up(htab->map.key_size, 8); value_size = htab->map.value_size; size = round_up(value_size, 8); if (is_percpu) value_size = size * num_possible_cpus(); total = 0; /* while experimenting with hash tables with sizes ranging from 10 to * 1000, it was observed that a bucket can have up to 5 entries. */ bucket_size = 5; alloc: /* We cannot do copy_from_user or copy_to_user inside * the rcu_read_lock. Allocate enough space here. */ keys = kvmalloc_array(key_size, bucket_size, GFP_USER | __GFP_NOWARN); values = kvmalloc_array(value_size, bucket_size, GFP_USER | __GFP_NOWARN); if (!keys || !values) { ret = -ENOMEM; goto after_loop; } again: bpf_disable_instrumentation(); rcu_read_lock(); again_nocopy: dst_key = keys; dst_val = values; b = &htab->buckets[batch]; head = &b->head; /* do not grab the lock unless need it (bucket_cnt > 0). */ if (locked) { ret = htab_lock_bucket(htab, b, batch, &flags); if (ret) { rcu_read_unlock(); bpf_enable_instrumentation(); goto after_loop; } } bucket_cnt = 0; hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) bucket_cnt++; if (bucket_cnt && !locked) { locked = true; goto again_nocopy; } if (bucket_cnt > (max_count - total)) { if (total == 0) ret = -ENOSPC; /* Note that since bucket_cnt > 0 here, it is implicit * that the locked was grabbed, so release it. */ htab_unlock_bucket(htab, b, batch, flags); rcu_read_unlock(); bpf_enable_instrumentation(); goto after_loop; } if (bucket_cnt > bucket_size) { bucket_size = bucket_cnt; /* Note that since bucket_cnt > 0 here, it is implicit * that the locked was grabbed, so release it. */ htab_unlock_bucket(htab, b, batch, flags); rcu_read_unlock(); bpf_enable_instrumentation(); kvfree(keys); kvfree(values); goto alloc; } /* Next block is only safe to run if you have grabbed the lock */ if (!locked) goto next_batch; hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { memcpy(dst_key, l->key, key_size); if (is_percpu) { int off = 0, cpu; void __percpu *pptr; pptr = htab_elem_get_ptr(l, map->key_size); for_each_possible_cpu(cpu) { copy_map_value_long(&htab->map, dst_val + off, per_cpu_ptr(pptr, cpu)); check_and_init_map_value(&htab->map, dst_val + off); off += size; } } else { value = l->key + roundup_key_size; if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { struct bpf_map **inner_map = value; /* Actual value is the id of the inner map */ map_id = map->ops->map_fd_sys_lookup_elem(*inner_map); value = &map_id; } if (elem_map_flags & BPF_F_LOCK) copy_map_value_locked(map, dst_val, value, true); else copy_map_value(map, dst_val, value); /* Zeroing special fields in the temp buffer */ check_and_init_map_value(map, dst_val); } if (do_delete) { hlist_nulls_del_rcu(&l->hash_node); /* bpf_lru_push_free() will acquire lru_lock, which * may cause deadlock. See comments in function * prealloc_lru_pop(). Let us do bpf_lru_push_free() * after releasing the bucket lock. */ if (is_lru_map) { l->batch_flink = node_to_free; node_to_free = l; } else { free_htab_elem(htab, l); } } dst_key += key_size; dst_val += value_size; } htab_unlock_bucket(htab, b, batch, flags); locked = false; while (node_to_free) { l = node_to_free; node_to_free = node_to_free->batch_flink; htab_lru_push_free(htab, l); } next_batch: /* If we are not copying data, we can go to next bucket and avoid * unlocking the rcu. */ if (!bucket_cnt && (batch + 1 < htab->n_buckets)) { batch++; goto again_nocopy; } rcu_read_unlock(); bpf_enable_instrumentation(); if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys, key_size * bucket_cnt) || copy_to_user(uvalues + total * value_size, values, value_size * bucket_cnt))) { ret = -EFAULT; goto after_loop; } total += bucket_cnt; batch++; if (batch >= htab->n_buckets) { ret = -ENOENT; goto after_loop; } goto again; after_loop: if (ret == -EFAULT) goto out; /* copy # of entries and next batch */ ubatch = u64_to_user_ptr(attr->batch.out_batch); if (copy_to_user(ubatch, &batch, sizeof(batch)) || put_user(total, &uattr->batch.count)) ret = -EFAULT; out: kvfree(keys); kvfree(values); return ret; } static int htab_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr) { return __htab_map_lookup_and_delete_batch(map, attr, uattr, false, false, true); } static int htab_percpu_map_lookup_and_delete_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr) { return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, false, true); } static int htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr) { return __htab_map_lookup_and_delete_batch(map, attr, uattr, false, false, false); } static int htab_map_lookup_and_delete_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr) { return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, false, false); } static int htab_lru_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr) { return __htab_map_lookup_and_delete_batch(map, attr, uattr, false, true, true); } static int htab_lru_percpu_map_lookup_and_delete_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr) { return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, true, true); } static int htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr) { return __htab_map_lookup_and_delete_batch(map, attr, uattr, false, true, false); } static int htab_lru_map_lookup_and_delete_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr) { return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, true, false); } struct bpf_iter_seq_hash_map_info { struct bpf_map *map; struct bpf_htab *htab; void *percpu_value_buf; // non-zero means percpu hash u32 bucket_id; u32 skip_elems; }; static struct htab_elem * bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info, struct htab_elem *prev_elem) { const struct bpf_htab *htab = info->htab; u32 skip_elems = info->skip_elems; u32 bucket_id = info->bucket_id; struct hlist_nulls_head *head; struct hlist_nulls_node *n; struct htab_elem *elem; struct bucket *b; u32 i, count; if (bucket_id >= htab->n_buckets) return NULL; /* try to find next elem in the same bucket */ if (prev_elem) { /* no update/deletion on this bucket, prev_elem should be still valid * and we won't skip elements. */ n = rcu_dereference_raw(hlist_nulls_next_rcu(&prev_elem->hash_node)); elem = hlist_nulls_entry_safe(n, struct htab_elem, hash_node); if (elem) return elem; /* not found, unlock and go to the next bucket */ b = &htab->buckets[bucket_id++]; rcu_read_unlock(); skip_elems = 0; } for (i = bucket_id; i < htab->n_buckets; i++) { b = &htab->buckets[i]; rcu_read_lock(); count = 0; head = &b->head; hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) { if (count >= skip_elems) { info->bucket_id = i; info->skip_elems = count; return elem; } count++; } rcu_read_unlock(); skip_elems = 0; } info->bucket_id = i; info->skip_elems = 0; return NULL; } static void *bpf_hash_map_seq_start(struct seq_file *seq, loff_t *pos) { struct bpf_iter_seq_hash_map_info *info = seq->private; struct htab_elem *elem; elem = bpf_hash_map_seq_find_next(info, NULL); if (!elem) return NULL; if (*pos == 0) ++*pos; return elem; } static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct bpf_iter_seq_hash_map_info *info = seq->private; ++*pos; ++info->skip_elems; return bpf_hash_map_seq_find_next(info, v); } static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem) { struct bpf_iter_seq_hash_map_info *info = seq->private; u32 roundup_key_size, roundup_value_size; struct bpf_iter__bpf_map_elem ctx = {}; struct bpf_map *map = info->map; struct bpf_iter_meta meta; int ret = 0, off = 0, cpu; struct bpf_prog *prog; void __percpu *pptr; meta.seq = seq; prog = bpf_iter_get_info(&meta, elem == NULL); if (prog) { ctx.meta = &meta; ctx.map = info->map; if (elem) { roundup_key_size = round_up(map->key_size, 8); ctx.key = elem->key; if (!info->percpu_value_buf) { ctx.value = elem->key + roundup_key_size; } else { roundup_value_size = round_up(map->value_size, 8); pptr = htab_elem_get_ptr(elem, map->key_size); for_each_possible_cpu(cpu) { copy_map_value_long(map, info->percpu_value_buf + off, per_cpu_ptr(pptr, cpu)); check_and_init_map_value(map, info->percpu_value_buf + off); off += roundup_value_size; } ctx.value = info->percpu_value_buf; } } ret = bpf_iter_run_prog(prog, &ctx); } return ret; } static int bpf_hash_map_seq_show(struct seq_file *seq, void *v) { return __bpf_hash_map_seq_show(seq, v); } static void bpf_hash_map_seq_stop(struct seq_file *seq, void *v) { if (!v) (void)__bpf_hash_map_seq_show(seq, NULL); else rcu_read_unlock(); } static int bpf_iter_init_hash_map(void *priv_data, struct bpf_iter_aux_info *aux) { struct bpf_iter_seq_hash_map_info *seq_info = priv_data; struct bpf_map *map = aux->map; void *value_buf; u32 buf_size; if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { buf_size = round_up(map->value_size, 8) * num_possible_cpus(); value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN); if (!value_buf) return -ENOMEM; seq_info->percpu_value_buf = value_buf; } bpf_map_inc_with_uref(map); seq_info->map = map; seq_info->htab = container_of(map, struct bpf_htab, map); return 0; } static void bpf_iter_fini_hash_map(void *priv_data) { struct bpf_iter_seq_hash_map_info *seq_info = priv_data; bpf_map_put_with_uref(seq_info->map); kfree(seq_info->percpu_value_buf); } static const struct seq_operations bpf_hash_map_seq_ops = { .start = bpf_hash_map_seq_start, .next = bpf_hash_map_seq_next, .stop = bpf_hash_map_seq_stop, .show = bpf_hash_map_seq_show, }; static const struct bpf_iter_seq_info iter_seq_info = { .seq_ops = &bpf_hash_map_seq_ops, .init_seq_private = bpf_iter_init_hash_map, .fini_seq_private = bpf_iter_fini_hash_map, .seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info), }; static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn, void *callback_ctx, u64 flags) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct hlist_nulls_head *head; struct hlist_nulls_node *n; struct htab_elem *elem; u32 roundup_key_size; int i, num_elems = 0; void __percpu *pptr; struct bucket *b; void *key, *val; bool is_percpu; u64 ret = 0; if (flags != 0) return -EINVAL; is_percpu = htab_is_percpu(htab); roundup_key_size = round_up(map->key_size, 8); /* disable migration so percpu value prepared here will be the * same as the one seen by the bpf program with bpf_map_lookup_elem(). */ if (is_percpu) migrate_disable(); for (i = 0; i < htab->n_buckets; i++) { b = &htab->buckets[i]; rcu_read_lock(); head = &b->head; hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) { key = elem->key; if (is_percpu) { /* current cpu value for percpu map */ pptr = htab_elem_get_ptr(elem, map->key_size); val = this_cpu_ptr(pptr); } else { val = elem->key + roundup_key_size; } num_elems++; ret = callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)val, (u64)(long)callback_ctx, 0); /* return value: 0 - continue, 1 - stop and return */ if (ret) { rcu_read_unlock(); goto out; } } rcu_read_unlock(); } out: if (is_percpu) migrate_enable(); return num_elems; } static u64 htab_map_mem_usage(const struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); u32 value_size = round_up(htab->map.value_size, 8); bool prealloc = htab_is_prealloc(htab); bool percpu = htab_is_percpu(htab); bool lru = htab_is_lru(htab); u64 num_entries; u64 usage = sizeof(struct bpf_htab); usage += sizeof(struct bucket) * htab->n_buckets; usage += sizeof(int) * num_possible_cpus() * HASHTAB_MAP_LOCK_COUNT; if (prealloc) { num_entries = map->max_entries; if (htab_has_extra_elems(htab)) num_entries += num_possible_cpus(); usage += htab->elem_size * num_entries; if (percpu) usage += value_size * num_possible_cpus() * num_entries; else if (!lru) usage += sizeof(struct htab_elem *) * num_possible_cpus(); } else { #define LLIST_NODE_SZ sizeof(struct llist_node) num_entries = htab->use_percpu_counter ? percpu_counter_sum(&htab->pcount) : atomic_read(&htab->count); usage += (htab->elem_size + LLIST_NODE_SZ) * num_entries; if (percpu) { usage += (LLIST_NODE_SZ + sizeof(void *)) * num_entries; usage += value_size * num_possible_cpus() * num_entries; } } return usage; } BTF_ID_LIST_SINGLE(htab_map_btf_ids, struct, bpf_htab) const struct bpf_map_ops htab_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, .map_release_uref = htab_map_free_timers_and_wq, .map_lookup_elem = htab_map_lookup_elem, .map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem, .map_update_elem = htab_map_update_elem, .map_delete_elem = htab_map_delete_elem, .map_gen_lookup = htab_map_gen_lookup, .map_seq_show_elem = htab_map_seq_show_elem, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, .map_mem_usage = htab_map_mem_usage, BATCH_OPS(htab), .map_btf_id = &htab_map_btf_ids[0], .iter_seq_info = &iter_seq_info, }; const struct bpf_map_ops htab_lru_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, .map_release_uref = htab_map_free_timers_and_wq, .map_lookup_elem = htab_lru_map_lookup_elem, .map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem, .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys, .map_update_elem = htab_lru_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, .map_gen_lookup = htab_lru_map_gen_lookup, .map_seq_show_elem = htab_map_seq_show_elem, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, .map_mem_usage = htab_map_mem_usage, BATCH_OPS(htab_lru), .map_btf_id = &htab_map_btf_ids[0], .iter_seq_info = &iter_seq_info, }; /* Called from eBPF program */ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) { struct htab_elem *l = __htab_map_lookup_elem(map, key); if (l) return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size)); else return NULL; } /* inline bpf_map_lookup_elem() call for per-CPU hashmap */ static int htab_percpu_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_insn *insn = insn_buf; if (!bpf_jit_supports_percpu_insn()) return -EOPNOTSUPP; BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, (void *(*)(struct bpf_map *map, void *key))NULL)); *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem); *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3); *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, offsetof(struct htab_elem, key) + map->key_size); *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0); *insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); return insn - insn_buf; } static void *htab_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) { struct htab_elem *l; if (cpu >= nr_cpu_ids) return NULL; l = __htab_map_lookup_elem(map, key); if (l) return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu); else return NULL; } static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key) { struct htab_elem *l = __htab_map_lookup_elem(map, key); if (l) { bpf_lru_node_set_ref(&l->lru_node); return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size)); } return NULL; } static void *htab_lru_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) { struct htab_elem *l; if (cpu >= nr_cpu_ids) return NULL; l = __htab_map_lookup_elem(map, key); if (l) { bpf_lru_node_set_ref(&l->lru_node); return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu); } return NULL; } int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) { struct htab_elem *l; void __percpu *pptr; int ret = -ENOENT; int cpu, off = 0; u32 size; /* per_cpu areas are zero-filled and bpf programs can only * access 'value_size' of them, so copying rounded areas * will not leak any kernel data */ size = round_up(map->value_size, 8); rcu_read_lock(); l = __htab_map_lookup_elem(map, key); if (!l) goto out; /* We do not mark LRU map element here in order to not mess up * eviction heuristics when user space does a map walk. */ pptr = htab_elem_get_ptr(l, map->key_size); for_each_possible_cpu(cpu) { copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu)); check_and_init_map_value(map, value + off); off += size; } ret = 0; out: rcu_read_unlock(); return ret; } int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); int ret; rcu_read_lock(); if (htab_is_lru(htab)) ret = __htab_lru_percpu_map_update_elem(map, key, value, map_flags, true); else ret = __htab_percpu_map_update_elem(map, key, value, map_flags, true); rcu_read_unlock(); return ret; } static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key, struct seq_file *m) { struct htab_elem *l; void __percpu *pptr; int cpu; rcu_read_lock(); l = __htab_map_lookup_elem(map, key); if (!l) { rcu_read_unlock(); return; } btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); seq_puts(m, ": {\n"); pptr = htab_elem_get_ptr(l, map->key_size); for_each_possible_cpu(cpu) { seq_printf(m, "\tcpu%d: ", cpu); btf_type_seq_show(map->btf, map->btf_value_type_id, per_cpu_ptr(pptr, cpu), m); seq_puts(m, "\n"); } seq_puts(m, "}\n"); rcu_read_unlock(); } const struct bpf_map_ops htab_percpu_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, .map_lookup_elem = htab_percpu_map_lookup_elem, .map_gen_lookup = htab_percpu_map_gen_lookup, .map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem, .map_update_elem = htab_percpu_map_update_elem, .map_delete_elem = htab_map_delete_elem, .map_lookup_percpu_elem = htab_percpu_map_lookup_percpu_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, .map_mem_usage = htab_map_mem_usage, BATCH_OPS(htab_percpu), .map_btf_id = &htab_map_btf_ids[0], .iter_seq_info = &iter_seq_info, }; const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, .map_lookup_elem = htab_lru_percpu_map_lookup_elem, .map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem, .map_update_elem = htab_lru_percpu_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, .map_lookup_percpu_elem = htab_lru_percpu_map_lookup_percpu_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, .map_mem_usage = htab_map_mem_usage, BATCH_OPS(htab_lru_percpu), .map_btf_id = &htab_map_btf_ids[0], .iter_seq_info = &iter_seq_info, }; static int fd_htab_map_alloc_check(union bpf_attr *attr) { if (attr->value_size != sizeof(u32)) return -EINVAL; return htab_map_alloc_check(attr); } static void fd_htab_map_free(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct hlist_nulls_node *n; struct hlist_nulls_head *head; struct htab_elem *l; int i; for (i = 0; i < htab->n_buckets; i++) { head = select_bucket(htab, i); hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { void *ptr = fd_htab_map_get_ptr(map, l); map->ops->map_fd_put_ptr(map, ptr, false); } } htab_map_free(map); } /* only called from syscall */ int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value) { void **ptr; int ret = 0; if (!map->ops->map_fd_sys_lookup_elem) return -ENOTSUPP; rcu_read_lock(); ptr = htab_map_lookup_elem(map, key); if (ptr) *value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr)); else ret = -ENOENT; rcu_read_unlock(); return ret; } /* only called from syscall */ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags) { void *ptr; int ret; u32 ufd = *(u32 *)value; ptr = map->ops->map_fd_get_ptr(map, map_file, ufd); if (IS_ERR(ptr)) return PTR_ERR(ptr); /* The htab bucket lock is always held during update operations in fd * htab map, and the following rcu_read_lock() is only used to avoid * the WARN_ON_ONCE in htab_map_update_elem(). */ rcu_read_lock(); ret = htab_map_update_elem(map, key, &ptr, map_flags); rcu_read_unlock(); if (ret) map->ops->map_fd_put_ptr(map, ptr, false); return ret; } static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr) { struct bpf_map *map, *inner_map_meta; inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd); if (IS_ERR(inner_map_meta)) return inner_map_meta; map = htab_map_alloc(attr); if (IS_ERR(map)) { bpf_map_meta_free(inner_map_meta); return map; } map->inner_map_meta = inner_map_meta; return map; } static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_map **inner_map = htab_map_lookup_elem(map, key); if (!inner_map) return NULL; return READ_ONCE(*inner_map); } static int htab_of_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_insn *insn = insn_buf; const int ret = BPF_REG_0; BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, (void *(*)(struct bpf_map *map, void *key))NULL)); *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem); *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2); *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, offsetof(struct htab_elem, key) + round_up(map->key_size, 8)); *insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0); return insn - insn_buf; } static void htab_of_map_free(struct bpf_map *map) { bpf_map_meta_free(map->inner_map_meta); fd_htab_map_free(map); } const struct bpf_map_ops htab_of_maps_map_ops = { .map_alloc_check = fd_htab_map_alloc_check, .map_alloc = htab_of_map_alloc, .map_free = htab_of_map_free, .map_get_next_key = htab_map_get_next_key, .map_lookup_elem = htab_of_map_lookup_elem, .map_delete_elem = htab_map_delete_elem, .map_fd_get_ptr = bpf_map_fd_get_ptr, .map_fd_put_ptr = bpf_map_fd_put_ptr, .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, .map_gen_lookup = htab_of_map_gen_lookup, .map_check_btf = map_check_no_btf, .map_mem_usage = htab_map_mem_usage, BATCH_OPS(htab), .map_btf_id = &htab_map_btf_ids[0], };
18 15 18 18 17 18 18 18 18 18 18 1 1 1 1 7 7 6 5 1 1 6 15 15 15 13 13 15 15 15 15 15 15 15 15 15 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/init.h> #include <linux/errno.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/io_uring.h> #include <linux/io_uring_types.h> #include <asm/shmparam.h> #include "memmap.h" #include "kbuf.h" static void *io_mem_alloc_compound(struct page **pages, int nr_pages, size_t size, gfp_t gfp) { struct page *page; int i, order; order = get_order(size); if (order > MAX_PAGE_ORDER) return ERR_PTR(-ENOMEM); else if (order) gfp |= __GFP_COMP; page = alloc_pages(gfp, order); if (!page) return ERR_PTR(-ENOMEM); for (i = 0; i < nr_pages; i++) pages[i] = page + i; return page_address(page); } static void *io_mem_alloc_single(struct page **pages, int nr_pages, size_t size, gfp_t gfp) { void *ret; int i; for (i = 0; i < nr_pages; i++) { pages[i] = alloc_page(gfp); if (!pages[i]) goto err; } ret = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); if (ret) return ret; err: while (i--) put_page(pages[i]); return ERR_PTR(-ENOMEM); } void *io_pages_map(struct page ***out_pages, unsigned short *npages, size_t size) { gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN; struct page **pages; int nr_pages; void *ret; nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; pages = kvmalloc_array(nr_pages, sizeof(struct page *), gfp); if (!pages) return ERR_PTR(-ENOMEM); ret = io_mem_alloc_compound(pages, nr_pages, size, gfp); if (!IS_ERR(ret)) goto done; ret = io_mem_alloc_single(pages, nr_pages, size, gfp); if (!IS_ERR(ret)) { done: *out_pages = pages; *npages = nr_pages; return ret; } kvfree(pages); *out_pages = NULL; *npages = 0; return ret; } void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages, bool put_pages) { bool do_vunmap = false; if (!ptr) return; if (put_pages && *npages) { struct page **to_free = *pages; int i; /* * Only did vmap for the non-compound multiple page case. * For the compound page, we just need to put the head. */ if (PageCompound(to_free[0])) *npages = 1; else if (*npages > 1) do_vunmap = true; for (i = 0; i < *npages; i++) put_page(to_free[i]); } if (do_vunmap) vunmap(ptr); kvfree(*pages); *pages = NULL; *npages = 0; } void io_pages_free(struct page ***pages, int npages) { struct page **page_array = *pages; if (!page_array) return; unpin_user_pages(page_array, npages); kvfree(page_array); *pages = NULL; } struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages) { unsigned long start, end, nr_pages; struct page **pages; int ret; end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; start = uaddr >> PAGE_SHIFT; nr_pages = end - start; if (WARN_ON_ONCE(!nr_pages)) return ERR_PTR(-EINVAL); pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!pages) return ERR_PTR(-ENOMEM); ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM, pages); /* success, mapped all pages */ if (ret == nr_pages) { *npages = nr_pages; return pages; } /* partial map, or didn't map anything */ if (ret >= 0) { /* if we did partial map, release any pages we did get */ if (ret) unpin_user_pages(pages, ret); ret = -EFAULT; } kvfree(pages); return ERR_PTR(ret); } void *__io_uaddr_map(struct page ***pages, unsigned short *npages, unsigned long uaddr, size_t size) { struct page **page_array; unsigned int nr_pages; void *page_addr; *npages = 0; if (uaddr & (PAGE_SIZE - 1) || !size) return ERR_PTR(-EINVAL); nr_pages = 0; page_array = io_pin_pages(uaddr, size, &nr_pages); if (IS_ERR(page_array)) return page_array; page_addr = vmap(page_array, nr_pages, VM_MAP, PAGE_KERNEL); if (page_addr) { *pages = page_array; *npages = nr_pages; return page_addr; } io_pages_free(&page_array, nr_pages); return ERR_PTR(-ENOMEM); } static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff, size_t sz) { struct io_ring_ctx *ctx = file->private_data; loff_t offset = pgoff << PAGE_SHIFT; switch ((pgoff << PAGE_SHIFT) & IORING_OFF_MMAP_MASK) { case IORING_OFF_SQ_RING: case IORING_OFF_CQ_RING: /* Don't allow mmap if the ring was setup without it */ if (ctx->flags & IORING_SETUP_NO_MMAP) return ERR_PTR(-EINVAL); return ctx->rings; case IORING_OFF_SQES: /* Don't allow mmap if the ring was setup without it */ if (ctx->flags & IORING_SETUP_NO_MMAP) return ERR_PTR(-EINVAL); return ctx->sq_sqes; case IORING_OFF_PBUF_RING: { struct io_buffer_list *bl; unsigned int bgid; void *ptr; bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; bl = io_pbuf_get_bl(ctx, bgid); if (IS_ERR(bl)) return bl; ptr = bl->buf_ring; io_put_bl(ctx, bl); return ptr; } } return ERR_PTR(-EINVAL); } int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma, struct page **pages, int npages) { unsigned long nr_pages = npages; vm_flags_set(vma, VM_DONTEXPAND); return vm_insert_pages(vma, vma->vm_start, pages, &nr_pages); } #ifdef CONFIG_MMU __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) { struct io_ring_ctx *ctx = file->private_data; size_t sz = vma->vm_end - vma->vm_start; long offset = vma->vm_pgoff << PAGE_SHIFT; unsigned int npages; void *ptr; ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); if (IS_ERR(ptr)) return PTR_ERR(ptr); switch (offset & IORING_OFF_MMAP_MASK) { case IORING_OFF_SQ_RING: case IORING_OFF_CQ_RING: npages = min(ctx->n_ring_pages, (sz + PAGE_SIZE - 1) >> PAGE_SHIFT); return io_uring_mmap_pages(ctx, vma, ctx->ring_pages, npages); case IORING_OFF_SQES: return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages, ctx->n_sqe_pages); case IORING_OFF_PBUF_RING: return io_pbuf_mmap(file, vma); } return -EINVAL; } unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { void *ptr; /* * Do not allow to map to user-provided address to avoid breaking the * aliasing rules. Userspace is not able to guess the offset address of * kernel kmalloc()ed memory area. */ if (addr) return -EINVAL; ptr = io_uring_validate_mmap_request(filp, pgoff, len); if (IS_ERR(ptr)) return -ENOMEM; /* * Some architectures have strong cache aliasing requirements. * For such architectures we need a coherent mapping which aliases * kernel memory *and* userspace memory. To achieve that: * - use a NULL file pointer to reference physical memory, and * - use the kernel virtual address of the shared io_uring context * (instead of the userspace-provided address, which has to be 0UL * anyway). * - use the same pgoff which the get_unmapped_area() uses to * calculate the page colouring. * For architectures without such aliasing requirements, the * architecture will return any suitable mapping because addr is 0. */ filp = NULL; flags |= MAP_SHARED; pgoff = 0; /* has been translated to ptr above */ #ifdef SHM_COLOUR addr = (uintptr_t) ptr; pgoff = addr >> PAGE_SHIFT; #else addr = 0UL; #endif return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags); } #else /* !CONFIG_MMU */ int io_uring_mmap(struct file *file, struct vm_area_struct *vma) { return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -EINVAL; } unsigned int io_uring_nommu_mmap_capabilities(struct file *file) { return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE; } unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { void *ptr; ptr = io_uring_validate_mmap_request(file, pgoff, len); if (IS_ERR(ptr)) return PTR_ERR(ptr); return (unsigned long) ptr; } #endif /* !CONFIG_MMU */
9 8 6 2 1 9 6 5 6 1 1 2 2 3 8 8 10 7 8 10 6 4 6 122 112 20 17 8 16 10 8 8 1 1 1 10 8 2 2 2 2 2 5 10 5 10 19 19 19 19 19 19 19 10 5 2 14 14 14 14 14 1 1 1 14 9 9 9 9 9 9 1 9 9 8 9 9 9 9 9 4 9 2 12 2 12 2 2 2 2 28 4 28 28 25 28 4 4 29 4 29 29 29 9 9 29 19 13 13 12 26 25 20 12 11 11 20 10 19 9 27 28 19 28 28 28 27 5 2 6 28 29 4 4 4 2 2 2 2 1 1 2 5 4 5 4 2 2 2 2 2 2 2 4 2 37 54 3 7 7 7 7 7 3 7 3 5 7 5 2 2 1 2 10 10 10 10 20 19 20 20 20 20 20 20 20 11 11 11 11 11 1 10 10 10 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Support for INET connection oriented protocols. * * Authors: See the TCP sources */ #include <linux/module.h> #include <linux/jhash.h> #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> #include <net/inet_timewait_sock.h> #include <net/ip.h> #include <net/route.h> #include <net/tcp_states.h> #include <net/xfrm.h> #include <net/tcp.h> #include <net/sock_reuseport.h> #include <net/addrconf.h> #if IS_ENABLED(CONFIG_IPV6) /* match_sk*_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses * if IPv6 only, and any IPv4 addresses * if not IPv6 only * match_sk*_wildcard == false: addresses must be exactly the same, i.e. * IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY, * and 0.0.0.0 equals to 0.0.0.0 only */ static bool ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6, const struct in6_addr *sk2_rcv_saddr6, __be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr, bool sk1_ipv6only, bool sk2_ipv6only, bool match_sk1_wildcard, bool match_sk2_wildcard) { int addr_type = ipv6_addr_type(sk1_rcv_saddr6); int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; /* if both are mapped, treat as IPv4 */ if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) { if (!sk2_ipv6only) { if (sk1_rcv_saddr == sk2_rcv_saddr) return true; return (match_sk1_wildcard && !sk1_rcv_saddr) || (match_sk2_wildcard && !sk2_rcv_saddr); } return false; } if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY) return true; if (addr_type2 == IPV6_ADDR_ANY && match_sk2_wildcard && !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED)) return true; if (addr_type == IPV6_ADDR_ANY && match_sk1_wildcard && !(sk1_ipv6only && addr_type2 == IPV6_ADDR_MAPPED)) return true; if (sk2_rcv_saddr6 && ipv6_addr_equal(sk1_rcv_saddr6, sk2_rcv_saddr6)) return true; return false; } #endif /* match_sk*_wildcard == true: 0.0.0.0 equals to any IPv4 addresses * match_sk*_wildcard == false: addresses must be exactly the same, i.e. * 0.0.0.0 only equals to 0.0.0.0 */ static bool ipv4_rcv_saddr_equal(__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr, bool sk2_ipv6only, bool match_sk1_wildcard, bool match_sk2_wildcard) { if (!sk2_ipv6only) { if (sk1_rcv_saddr == sk2_rcv_saddr) return true; return (match_sk1_wildcard && !sk1_rcv_saddr) || (match_sk2_wildcard && !sk2_rcv_saddr); } return false; } bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, bool match_wildcard) { #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) return ipv6_rcv_saddr_equal(&sk->sk_v6_rcv_saddr, inet6_rcv_saddr(sk2), sk->sk_rcv_saddr, sk2->sk_rcv_saddr, ipv6_only_sock(sk), ipv6_only_sock(sk2), match_wildcard, match_wildcard); #endif return ipv4_rcv_saddr_equal(sk->sk_rcv_saddr, sk2->sk_rcv_saddr, ipv6_only_sock(sk2), match_wildcard, match_wildcard); } EXPORT_SYMBOL(inet_rcv_saddr_equal); bool inet_rcv_saddr_any(const struct sock *sk) { #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) return ipv6_addr_any(&sk->sk_v6_rcv_saddr); #endif return !sk->sk_rcv_saddr; } /** * inet_sk_get_local_port_range - fetch ephemeral ports range * @sk: socket * @low: pointer to low port * @high: pointer to high port * * Fetch netns port range (/proc/sys/net/ipv4/ip_local_port_range) * Range can be overridden if socket got IP_LOCAL_PORT_RANGE option. * Returns true if IP_LOCAL_PORT_RANGE was set on this socket. */ bool inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high) { int lo, hi, sk_lo, sk_hi; bool local_range = false; u32 sk_range; inet_get_local_port_range(sock_net(sk), &lo, &hi); sk_range = READ_ONCE(inet_sk(sk)->local_port_range); if (unlikely(sk_range)) { sk_lo = sk_range & 0xffff; sk_hi = sk_range >> 16; if (lo <= sk_lo && sk_lo <= hi) lo = sk_lo; if (lo <= sk_hi && sk_hi <= hi) hi = sk_hi; local_range = true; } *low = lo; *high = hi; return local_range; } EXPORT_SYMBOL(inet_sk_get_local_port_range); static bool inet_use_bhash2_on_bind(const struct sock *sk) { #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) { int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); if (addr_type == IPV6_ADDR_ANY) return false; if (addr_type != IPV6_ADDR_MAPPED) return true; } #endif return sk->sk_rcv_saddr != htonl(INADDR_ANY); } static bool inet_bind_conflict(const struct sock *sk, struct sock *sk2, kuid_t sk_uid, bool relax, bool reuseport_cb_ok, bool reuseport_ok) { int bound_dev_if2; if (sk == sk2) return false; bound_dev_if2 = READ_ONCE(sk2->sk_bound_dev_if); if (!sk->sk_bound_dev_if || !bound_dev_if2 || sk->sk_bound_dev_if == bound_dev_if2) { if (sk->sk_reuse && sk2->sk_reuse && sk2->sk_state != TCP_LISTEN) { if (!relax || (!reuseport_ok && sk->sk_reuseport && sk2->sk_reuseport && reuseport_cb_ok && (sk2->sk_state == TCP_TIME_WAIT || uid_eq(sk_uid, sock_i_uid(sk2))))) return true; } else if (!reuseport_ok || !sk->sk_reuseport || !sk2->sk_reuseport || !reuseport_cb_ok || (sk2->sk_state != TCP_TIME_WAIT && !uid_eq(sk_uid, sock_i_uid(sk2)))) { return true; } } return false; } static bool __inet_bhash2_conflict(const struct sock *sk, struct sock *sk2, kuid_t sk_uid, bool relax, bool reuseport_cb_ok, bool reuseport_ok) { if (ipv6_only_sock(sk2)) { if (sk->sk_family == AF_INET) return false; #if IS_ENABLED(CONFIG_IPV6) if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) return false; #endif } return inet_bind_conflict(sk, sk2, sk_uid, relax, reuseport_cb_ok, reuseport_ok); } static bool inet_bhash2_conflict(const struct sock *sk, const struct inet_bind2_bucket *tb2, kuid_t sk_uid, bool relax, bool reuseport_cb_ok, bool reuseport_ok) { struct sock *sk2; sk_for_each_bound(sk2, &tb2->owners) { if (__inet_bhash2_conflict(sk, sk2, sk_uid, relax, reuseport_cb_ok, reuseport_ok)) return true; } return false; } #define sk_for_each_bound_bhash(__sk, __tb2, __tb) \ hlist_for_each_entry(__tb2, &(__tb)->bhash2, bhash_node) \ sk_for_each_bound(sk2, &(__tb2)->owners) /* This should be called only when the tb and tb2 hashbuckets' locks are held */ static int inet_csk_bind_conflict(const struct sock *sk, const struct inet_bind_bucket *tb, const struct inet_bind2_bucket *tb2, /* may be null */ bool relax, bool reuseport_ok) { kuid_t uid = sock_i_uid((struct sock *)sk); struct sock_reuseport *reuseport_cb; bool reuseport_cb_ok; struct sock *sk2; rcu_read_lock(); reuseport_cb = rcu_dereference(sk->sk_reuseport_cb); /* paired with WRITE_ONCE() in __reuseport_(add|detach)_closed_sock */ reuseport_cb_ok = !reuseport_cb || READ_ONCE(reuseport_cb->num_closed_socks); rcu_read_unlock(); /* Conflicts with an existing IPV6_ADDR_ANY (if ipv6) or INADDR_ANY (if * ipv4) should have been checked already. We need to do these two * checks separately because their spinlocks have to be acquired/released * independently of each other, to prevent possible deadlocks */ if (inet_use_bhash2_on_bind(sk)) return tb2 && inet_bhash2_conflict(sk, tb2, uid, relax, reuseport_cb_ok, reuseport_ok); /* Unlike other sk lookup places we do not check * for sk_net here, since _all_ the socks listed * in tb->owners and tb2->owners list belong * to the same net - the one this bucket belongs to. */ sk_for_each_bound_bhash(sk2, tb2, tb) { if (!inet_bind_conflict(sk, sk2, uid, relax, reuseport_cb_ok, reuseport_ok)) continue; if (inet_rcv_saddr_equal(sk, sk2, true)) return true; } return false; } /* Determine if there is a bind conflict with an existing IPV6_ADDR_ANY (if ipv6) or * INADDR_ANY (if ipv4) socket. * * Caller must hold bhash hashbucket lock with local bh disabled, to protect * against concurrent binds on the port for addr any */ static bool inet_bhash2_addr_any_conflict(const struct sock *sk, int port, int l3mdev, bool relax, bool reuseport_ok) { kuid_t uid = sock_i_uid((struct sock *)sk); const struct net *net = sock_net(sk); struct sock_reuseport *reuseport_cb; struct inet_bind_hashbucket *head2; struct inet_bind2_bucket *tb2; bool conflict = false; bool reuseport_cb_ok; rcu_read_lock(); reuseport_cb = rcu_dereference(sk->sk_reuseport_cb); /* paired with WRITE_ONCE() in __reuseport_(add|detach)_closed_sock */ reuseport_cb_ok = !reuseport_cb || READ_ONCE(reuseport_cb->num_closed_socks); rcu_read_unlock(); head2 = inet_bhash2_addr_any_hashbucket(sk, net, port); spin_lock(&head2->lock); inet_bind_bucket_for_each(tb2, &head2->chain) { if (!inet_bind2_bucket_match_addr_any(tb2, net, port, l3mdev, sk)) continue; if (!inet_bhash2_conflict(sk, tb2, uid, relax, reuseport_cb_ok, reuseport_ok)) continue; conflict = true; break; } spin_unlock(&head2->lock); return conflict; } /* * Find an open port number for the socket. Returns with the * inet_bind_hashbucket locks held if successful. */ static struct inet_bind_hashbucket * inet_csk_find_open_port(const struct sock *sk, struct inet_bind_bucket **tb_ret, struct inet_bind2_bucket **tb2_ret, struct inet_bind_hashbucket **head2_ret, int *port_ret) { struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); int i, low, high, attempt_half, port, l3mdev; struct inet_bind_hashbucket *head, *head2; struct net *net = sock_net(sk); struct inet_bind2_bucket *tb2; struct inet_bind_bucket *tb; u32 remaining, offset; bool relax = false; l3mdev = inet_sk_bound_l3mdev(sk); ports_exhausted: attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0; other_half_scan: inet_sk_get_local_port_range(sk, &low, &high); high++; /* [32768, 60999] -> [32768, 61000[ */ if (high - low < 4) attempt_half = 0; if (attempt_half) { int half = low + (((high - low) >> 2) << 1); if (attempt_half == 1) high = half; else low = half; } remaining = high - low; if (likely(remaining > 1)) remaining &= ~1U; offset = get_random_u32_below(remaining); /* __inet_hash_connect() favors ports having @low parity * We do the opposite to not pollute connect() users. */ offset |= 1U; other_parity_scan: port = low + offset; for (i = 0; i < remaining; i += 2, port += 2) { if (unlikely(port >= high)) port -= remaining; if (inet_is_local_reserved_port(net, port)) continue; head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)]; spin_lock_bh(&head->lock); if (inet_use_bhash2_on_bind(sk)) { if (inet_bhash2_addr_any_conflict(sk, port, l3mdev, relax, false)) goto next_port; } head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); spin_lock(&head2->lock); tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); inet_bind_bucket_for_each(tb, &head->chain) if (inet_bind_bucket_match(tb, net, port, l3mdev)) { if (!inet_csk_bind_conflict(sk, tb, tb2, relax, false)) goto success; spin_unlock(&head2->lock); goto next_port; } tb = NULL; goto success; next_port: spin_unlock_bh(&head->lock); cond_resched(); } offset--; if (!(offset & 1)) goto other_parity_scan; if (attempt_half == 1) { /* OK we now try the upper half of the range */ attempt_half = 2; goto other_half_scan; } if (READ_ONCE(net->ipv4.sysctl_ip_autobind_reuse) && !relax) { /* We still have a chance to connect to different destinations */ relax = true; goto ports_exhausted; } return NULL; success: *port_ret = port; *tb_ret = tb; *tb2_ret = tb2; *head2_ret = head2; return head; } static inline int sk_reuseport_match(struct inet_bind_bucket *tb, struct sock *sk) { kuid_t uid = sock_i_uid(sk); if (tb->fastreuseport <= 0) return 0; if (!sk->sk_reuseport) return 0; if (rcu_access_pointer(sk->sk_reuseport_cb)) return 0; if (!uid_eq(tb->fastuid, uid)) return 0; /* We only need to check the rcv_saddr if this tb was once marked * without fastreuseport and then was reset, as we can only know that * the fast_*rcv_saddr doesn't have any conflicts with the socks on the * owners list. */ if (tb->fastreuseport == FASTREUSEPORT_ANY) return 1; #if IS_ENABLED(CONFIG_IPV6) if (tb->fast_sk_family == AF_INET6) return ipv6_rcv_saddr_equal(&tb->fast_v6_rcv_saddr, inet6_rcv_saddr(sk), tb->fast_rcv_saddr, sk->sk_rcv_saddr, tb->fast_ipv6_only, ipv6_only_sock(sk), true, false); #endif return ipv4_rcv_saddr_equal(tb->fast_rcv_saddr, sk->sk_rcv_saddr, ipv6_only_sock(sk), true, false); } void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, struct sock *sk) { kuid_t uid = sock_i_uid(sk); bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; if (hlist_empty(&tb->bhash2)) { tb->fastreuse = reuse; if (sk->sk_reuseport) { tb->fastreuseport = FASTREUSEPORT_ANY; tb->fastuid = uid; tb->fast_rcv_saddr = sk->sk_rcv_saddr; tb->fast_ipv6_only = ipv6_only_sock(sk); tb->fast_sk_family = sk->sk_family; #if IS_ENABLED(CONFIG_IPV6) tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; #endif } else { tb->fastreuseport = 0; } } else { if (!reuse) tb->fastreuse = 0; if (sk->sk_reuseport) { /* We didn't match or we don't have fastreuseport set on * the tb, but we have sk_reuseport set on this socket * and we know that there are no bind conflicts with * this socket in this tb, so reset our tb's reuseport * settings so that any subsequent sockets that match * our current socket will be put on the fast path. * * If we reset we need to set FASTREUSEPORT_STRICT so we * do extra checking for all subsequent sk_reuseport * socks. */ if (!sk_reuseport_match(tb, sk)) { tb->fastreuseport = FASTREUSEPORT_STRICT; tb->fastuid = uid; tb->fast_rcv_saddr = sk->sk_rcv_saddr; tb->fast_ipv6_only = ipv6_only_sock(sk); tb->fast_sk_family = sk->sk_family; #if IS_ENABLED(CONFIG_IPV6) tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; #endif } } else { tb->fastreuseport = 0; } } } /* Obtain a reference to a local port for the given sock, * if snum is zero it means select any available local port. * We try to allocate an odd port (and leave even ports for connect()) */ int inet_csk_get_port(struct sock *sk, unsigned short snum) { struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; bool found_port = false, check_bind_conflict = true; bool bhash_created = false, bhash2_created = false; int ret = -EADDRINUSE, port = snum, l3mdev; struct inet_bind_hashbucket *head, *head2; struct inet_bind2_bucket *tb2 = NULL; struct inet_bind_bucket *tb = NULL; bool head2_lock_acquired = false; struct net *net = sock_net(sk); l3mdev = inet_sk_bound_l3mdev(sk); if (!port) { head = inet_csk_find_open_port(sk, &tb, &tb2, &head2, &port); if (!head) return ret; head2_lock_acquired = true; if (tb && tb2) goto success; found_port = true; } else { head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)]; spin_lock_bh(&head->lock); inet_bind_bucket_for_each(tb, &head->chain) if (inet_bind_bucket_match(tb, net, port, l3mdev)) break; } if (!tb) { tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, net, head, port, l3mdev); if (!tb) goto fail_unlock; bhash_created = true; } if (!found_port) { if (!hlist_empty(&tb->bhash2)) { if (sk->sk_reuse == SK_FORCE_REUSE || (tb->fastreuse > 0 && reuse) || sk_reuseport_match(tb, sk)) check_bind_conflict = false; } if (check_bind_conflict && inet_use_bhash2_on_bind(sk)) { if (inet_bhash2_addr_any_conflict(sk, port, l3mdev, true, true)) goto fail_unlock; } head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); spin_lock(&head2->lock); head2_lock_acquired = true; tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); } if (!tb2) { tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, net, head2, tb, sk); if (!tb2) goto fail_unlock; bhash2_created = true; } if (!found_port && check_bind_conflict) { if (inet_csk_bind_conflict(sk, tb, tb2, true, true)) goto fail_unlock; } success: inet_csk_update_fastreuse(tb, sk); if (!inet_csk(sk)->icsk_bind_hash) inet_bind_hash(sk, tb, tb2, port); WARN_ON(inet_csk(sk)->icsk_bind_hash != tb); WARN_ON(inet_csk(sk)->icsk_bind2_hash != tb2); ret = 0; fail_unlock: if (ret) { if (bhash2_created) inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, tb2); if (bhash_created) inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb); } if (head2_lock_acquired) spin_unlock(&head2->lock); spin_unlock_bh(&head->lock); return ret; } EXPORT_SYMBOL_GPL(inet_csk_get_port); /* * Wait for an incoming connection, avoid race conditions. This must be called * with the socket locked. */ static int inet_csk_wait_for_connect(struct sock *sk, long timeo) { struct inet_connection_sock *icsk = inet_csk(sk); DEFINE_WAIT(wait); int err; /* * True wake-one mechanism for incoming connections: only * one process gets woken up, not the 'whole herd'. * Since we do not 'race & poll' for established sockets * anymore, the common case will execute the loop only once. * * Subtle issue: "add_wait_queue_exclusive()" will be added * after any current non-exclusive waiters, and we know that * it will always _stay_ after any new non-exclusive waiters * because all non-exclusive waiters are added at the * beginning of the wait-queue. As such, it's ok to "drop" * our exclusiveness temporarily when we get woken up without * having to remove and re-insert us on the wait queue. */ for (;;) { prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); release_sock(sk); if (reqsk_queue_empty(&icsk->icsk_accept_queue)) timeo = schedule_timeout(timeo); sched_annotate_sleep(); lock_sock(sk); err = 0; if (!reqsk_queue_empty(&icsk->icsk_accept_queue)) break; err = -EINVAL; if (sk->sk_state != TCP_LISTEN) break; err = sock_intr_errno(timeo); if (signal_pending(current)) break; err = -EAGAIN; if (!timeo) break; } finish_wait(sk_sleep(sk), &wait); return err; } /* * This will accept the next outstanding connection. */ struct sock *inet_csk_accept(struct sock *sk, struct proto_accept_arg *arg) { struct inet_connection_sock *icsk = inet_csk(sk); struct request_sock_queue *queue = &icsk->icsk_accept_queue; struct request_sock *req; struct sock *newsk; int error; lock_sock(sk); /* We need to make sure that this socket is listening, * and that it has something pending. */ error = -EINVAL; if (sk->sk_state != TCP_LISTEN) goto out_err; /* Find already established connection */ if (reqsk_queue_empty(queue)) { long timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK); /* If this is a non blocking socket don't sleep */ error = -EAGAIN; if (!timeo) goto out_err; error = inet_csk_wait_for_connect(sk, timeo); if (error) goto out_err; } req = reqsk_queue_remove(queue, sk); arg->is_empty = reqsk_queue_empty(queue); newsk = req->sk; if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) { spin_lock_bh(&queue->fastopenq.lock); if (tcp_rsk(req)->tfo_listener) { /* We are still waiting for the final ACK from 3WHS * so can't free req now. Instead, we set req->sk to * NULL to signify that the child socket is taken * so reqsk_fastopen_remove() will free the req * when 3WHS finishes (or is aborted). */ req->sk = NULL; req = NULL; } spin_unlock_bh(&queue->fastopenq.lock); } out: release_sock(sk); if (newsk && mem_cgroup_sockets_enabled) { int amt = 0; /* atomically get the memory usage, set and charge the * newsk->sk_memcg. */ lock_sock(newsk); mem_cgroup_sk_alloc(newsk); if (newsk->sk_memcg) { /* The socket has not been accepted yet, no need * to look at newsk->sk_wmem_queued. */ amt = sk_mem_pages(newsk->sk_forward_alloc + atomic_read(&newsk->sk_rmem_alloc)); } if (amt) mem_cgroup_charge_skmem(newsk->sk_memcg, amt, GFP_KERNEL | __GFP_NOFAIL); release_sock(newsk); } if (req) reqsk_put(req); if (newsk) inet_init_csk_locks(newsk); return newsk; out_err: newsk = NULL; req = NULL; arg->err = error; goto out; } EXPORT_SYMBOL(inet_csk_accept); /* * Using different timers for retransmit, delayed acks and probes * We may wish use just one timer maintaining a list of expire jiffies * to optimize. */ void inet_csk_init_xmit_timers(struct sock *sk, void (*retransmit_handler)(struct timer_list *t), void (*delack_handler)(struct timer_list *t), void (*keepalive_handler)(struct timer_list *t)) { struct inet_connection_sock *icsk = inet_csk(sk); timer_setup(&icsk->icsk_retransmit_timer, retransmit_handler, 0); timer_setup(&icsk->icsk_delack_timer, delack_handler, 0); timer_setup(&sk->sk_timer, keepalive_handler, 0); icsk->icsk_pending = icsk->icsk_ack.pending = 0; } EXPORT_SYMBOL(inet_csk_init_xmit_timers); void inet_csk_clear_xmit_timers(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); icsk->icsk_pending = icsk->icsk_ack.pending = 0; sk_stop_timer(sk, &icsk->icsk_retransmit_timer); sk_stop_timer(sk, &icsk->icsk_delack_timer); sk_stop_timer(sk, &sk->sk_timer); } EXPORT_SYMBOL(inet_csk_clear_xmit_timers); void inet_csk_clear_xmit_timers_sync(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); /* ongoing timer handlers need to acquire socket lock. */ sock_not_owned_by_me(sk); icsk->icsk_pending = icsk->icsk_ack.pending = 0; sk_stop_timer_sync(sk, &icsk->icsk_retransmit_timer); sk_stop_timer_sync(sk, &icsk->icsk_delack_timer); sk_stop_timer_sync(sk, &sk->sk_timer); } void inet_csk_delete_keepalive_timer(struct sock *sk) { sk_stop_timer(sk, &sk->sk_timer); } EXPORT_SYMBOL(inet_csk_delete_keepalive_timer); void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len) { sk_reset_timer(sk, &sk->sk_timer, jiffies + len); } EXPORT_SYMBOL(inet_csk_reset_keepalive_timer); struct dst_entry *inet_csk_route_req(const struct sock *sk, struct flowi4 *fl4, const struct request_sock *req) { const struct inet_request_sock *ireq = inet_rsk(req); struct net *net = read_pnet(&ireq->ireq_net); struct ip_options_rcu *opt; struct rtable *rt; rcu_read_lock(); opt = rcu_dereference(ireq->ireq_opt); flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark, ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), sk->sk_protocol, inet_sk_flowi_flags(sk), (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, ireq->ir_rmt_port, htons(ireq->ir_num), sk->sk_uid); security_req_classify_flow(req, flowi4_to_flowi_common(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) goto route_err; rcu_read_unlock(); return &rt->dst; route_err: ip_rt_put(rt); no_route: rcu_read_unlock(); __IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } EXPORT_SYMBOL_GPL(inet_csk_route_req); struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, struct sock *newsk, const struct request_sock *req) { const struct inet_request_sock *ireq = inet_rsk(req); struct net *net = read_pnet(&ireq->ireq_net); struct inet_sock *newinet = inet_sk(newsk); struct ip_options_rcu *opt; struct flowi4 *fl4; struct rtable *rt; opt = rcu_dereference(ireq->ireq_opt); fl4 = &newinet->cork.fl.u.ip4; flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark, ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), sk->sk_protocol, inet_sk_flowi_flags(sk), (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, ireq->ir_rmt_port, htons(ireq->ir_num), sk->sk_uid); security_req_classify_flow(req, flowi4_to_flowi_common(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) goto route_err; return &rt->dst; route_err: ip_rt_put(rt); no_route: __IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } EXPORT_SYMBOL_GPL(inet_csk_route_child_sock); /* Decide when to expire the request and when to resend SYN-ACK */ static void syn_ack_recalc(struct request_sock *req, const int max_syn_ack_retries, const u8 rskq_defer_accept, int *expire, int *resend) { if (!rskq_defer_accept) { *expire = req->num_timeout >= max_syn_ack_retries; *resend = 1; return; } *expire = req->num_timeout >= max_syn_ack_retries && (!inet_rsk(req)->acked || req->num_timeout >= rskq_defer_accept); /* Do not resend while waiting for data after ACK, * start to resend on end of deferring period to give * last chance for data or ACK to create established socket. */ *resend = !inet_rsk(req)->acked || req->num_timeout >= rskq_defer_accept - 1; } int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req) { int err = req->rsk_ops->rtx_syn_ack(parent, req); if (!err) req->num_retrans++; return err; } EXPORT_SYMBOL(inet_rtx_syn_ack); static struct request_sock *inet_reqsk_clone(struct request_sock *req, struct sock *sk) { struct sock *req_sk, *nreq_sk; struct request_sock *nreq; nreq = kmem_cache_alloc(req->rsk_ops->slab, GFP_ATOMIC | __GFP_NOWARN); if (!nreq) { __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); /* paired with refcount_inc_not_zero() in reuseport_migrate_sock() */ sock_put(sk); return NULL; } req_sk = req_to_sk(req); nreq_sk = req_to_sk(nreq); memcpy(nreq_sk, req_sk, offsetof(struct sock, sk_dontcopy_begin)); unsafe_memcpy(&nreq_sk->sk_dontcopy_end, &req_sk->sk_dontcopy_end, req->rsk_ops->obj_size - offsetof(struct sock, sk_dontcopy_end), /* alloc is larger than struct, see above */); sk_node_init(&nreq_sk->sk_node); nreq_sk->sk_tx_queue_mapping = req_sk->sk_tx_queue_mapping; #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING nreq_sk->sk_rx_queue_mapping = req_sk->sk_rx_queue_mapping; #endif nreq_sk->sk_incoming_cpu = req_sk->sk_incoming_cpu; nreq->rsk_listener = sk; /* We need not acquire fastopenq->lock * because the child socket is locked in inet_csk_listen_stop(). */ if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(nreq)->tfo_listener) rcu_assign_pointer(tcp_sk(nreq->sk)->fastopen_rsk, nreq); return nreq; } static void reqsk_queue_migrated(struct request_sock_queue *queue, const struct request_sock *req) { if (req->num_timeout == 0) atomic_inc(&queue->young); atomic_inc(&queue->qlen); } static void reqsk_migrate_reset(struct request_sock *req) { req->saved_syn = NULL; #if IS_ENABLED(CONFIG_IPV6) inet_rsk(req)->ipv6_opt = NULL; inet_rsk(req)->pktopts = NULL; #else inet_rsk(req)->ireq_opt = NULL; #endif } /* return true if req was found in the ehash table */ static bool reqsk_queue_unlink(struct request_sock *req) { struct sock *sk = req_to_sk(req); bool found = false; if (sk_hashed(sk)) { struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); spinlock_t *lock = inet_ehash_lockp(hashinfo, req->rsk_hash); spin_lock(lock); found = __sk_nulls_del_node_init_rcu(sk); spin_unlock(lock); } if (timer_pending(&req->rsk_timer) && del_timer_sync(&req->rsk_timer)) reqsk_put(req); return found; } bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req) { bool unlinked = reqsk_queue_unlink(req); if (unlinked) { reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); reqsk_put(req); } return unlinked; } EXPORT_SYMBOL(inet_csk_reqsk_queue_drop); void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req) { inet_csk_reqsk_queue_drop(sk, req); reqsk_put(req); } EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put); static void reqsk_timer_handler(struct timer_list *t) { struct request_sock *req = from_timer(req, t, rsk_timer); struct request_sock *nreq = NULL, *oreq = req; struct sock *sk_listener = req->rsk_listener; struct inet_connection_sock *icsk; struct request_sock_queue *queue; struct net *net; int max_syn_ack_retries, qlen, expire = 0, resend = 0; if (inet_sk_state_load(sk_listener) != TCP_LISTEN) { struct sock *nsk; nsk = reuseport_migrate_sock(sk_listener, req_to_sk(req), NULL); if (!nsk) goto drop; nreq = inet_reqsk_clone(req, nsk); if (!nreq) goto drop; /* The new timer for the cloned req can decrease the 2 * by calling inet_csk_reqsk_queue_drop_and_put(), so * hold another count to prevent use-after-free and * call reqsk_put() just before return. */ refcount_set(&nreq->rsk_refcnt, 2 + 1); timer_setup(&nreq->rsk_timer, reqsk_timer_handler, TIMER_PINNED); reqsk_queue_migrated(&inet_csk(nsk)->icsk_accept_queue, req); req = nreq; sk_listener = nsk; } icsk = inet_csk(sk_listener); net = sock_net(sk_listener); max_syn_ack_retries = READ_ONCE(icsk->icsk_syn_retries) ? : READ_ONCE(net->ipv4.sysctl_tcp_synack_retries); /* Normally all the openreqs are young and become mature * (i.e. converted to established socket) for first timeout. * If synack was not acknowledged for 1 second, it means * one of the following things: synack was lost, ack was lost, * rtt is high or nobody planned to ack (i.e. synflood). * When server is a bit loaded, queue is populated with old * open requests, reducing effective size of queue. * When server is well loaded, queue size reduces to zero * after several minutes of work. It is not synflood, * it is normal operation. The solution is pruning * too old entries overriding normal timeout, when * situation becomes dangerous. * * Essentially, we reserve half of room for young * embrions; and abort old ones without pity, if old * ones are about to clog our table. */ queue = &icsk->icsk_accept_queue; qlen = reqsk_queue_len(queue); if ((qlen << 1) > max(8U, READ_ONCE(sk_listener->sk_max_ack_backlog))) { int young = reqsk_queue_len_young(queue) << 1; while (max_syn_ack_retries > 2) { if (qlen < young) break; max_syn_ack_retries--; young <<= 1; } } syn_ack_recalc(req, max_syn_ack_retries, READ_ONCE(queue->rskq_defer_accept), &expire, &resend); req->rsk_ops->syn_ack_timeout(req); if (!expire && (!resend || !inet_rtx_syn_ack(sk_listener, req) || inet_rsk(req)->acked)) { if (req->num_timeout++ == 0) atomic_dec(&queue->young); mod_timer(&req->rsk_timer, jiffies + reqsk_timeout(req, TCP_RTO_MAX)); if (!nreq) return; if (!inet_ehash_insert(req_to_sk(nreq), req_to_sk(oreq), NULL)) { /* delete timer */ inet_csk_reqsk_queue_drop(sk_listener, nreq); goto no_ownership; } __NET_INC_STATS(net, LINUX_MIB_TCPMIGRATEREQSUCCESS); reqsk_migrate_reset(oreq); reqsk_queue_removed(&inet_csk(oreq->rsk_listener)->icsk_accept_queue, oreq); reqsk_put(oreq); reqsk_put(nreq); return; } /* Even if we can clone the req, we may need not retransmit any more * SYN+ACKs (nreq->num_timeout > max_syn_ack_retries, etc), or another * CPU may win the "own_req" race so that inet_ehash_insert() fails. */ if (nreq) { __NET_INC_STATS(net, LINUX_MIB_TCPMIGRATEREQFAILURE); no_ownership: reqsk_migrate_reset(nreq); reqsk_queue_removed(queue, nreq); __reqsk_free(nreq); } drop: inet_csk_reqsk_queue_drop_and_put(oreq->rsk_listener, oreq); } static bool reqsk_queue_hash_req(struct request_sock *req, unsigned long timeout) { bool found_dup_sk = false; if (!inet_ehash_insert(req_to_sk(req), NULL, &found_dup_sk)) return false; /* The timer needs to be setup after a successful insertion. */ timer_setup(&req->rsk_timer, reqsk_timer_handler, TIMER_PINNED); mod_timer(&req->rsk_timer, jiffies + timeout); /* before letting lookups find us, make sure all req fields * are committed to memory and refcnt initialized. */ smp_wmb(); refcount_set(&req->rsk_refcnt, 2 + 1); return true; } bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, unsigned long timeout) { if (!reqsk_queue_hash_req(req, timeout)) return false; inet_csk_reqsk_queue_added(sk); return true; } EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); static void inet_clone_ulp(const struct request_sock *req, struct sock *newsk, const gfp_t priority) { struct inet_connection_sock *icsk = inet_csk(newsk); if (!icsk->icsk_ulp_ops) return; icsk->icsk_ulp_ops->clone(req, newsk, priority); } /** * inet_csk_clone_lock - clone an inet socket, and lock its clone * @sk: the socket to clone * @req: request_sock * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) * * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) */ struct sock *inet_csk_clone_lock(const struct sock *sk, const struct request_sock *req, const gfp_t priority) { struct sock *newsk = sk_clone_lock(sk, priority); if (newsk) { struct inet_connection_sock *newicsk = inet_csk(newsk); inet_sk_set_state(newsk, TCP_SYN_RECV); newicsk->icsk_bind_hash = NULL; newicsk->icsk_bind2_hash = NULL; inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port; inet_sk(newsk)->inet_num = inet_rsk(req)->ir_num; inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num); /* listeners have SOCK_RCU_FREE, not the children */ sock_reset_flag(newsk, SOCK_RCU_FREE); inet_sk(newsk)->mc_list = NULL; newsk->sk_mark = inet_rsk(req)->ir_mark; atomic64_set(&newsk->sk_cookie, atomic64_read(&inet_rsk(req)->ir_cookie)); newicsk->icsk_retransmits = 0; newicsk->icsk_backoff = 0; newicsk->icsk_probes_out = 0; newicsk->icsk_probes_tstamp = 0; /* Deinitialize accept_queue to trap illegal accesses. */ memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue)); inet_clone_ulp(req, newsk, priority); security_inet_csk_clone(newsk, req); } return newsk; } EXPORT_SYMBOL_GPL(inet_csk_clone_lock); /* * At this point, there should be no process reference to this * socket, and thus no user references at all. Therefore we * can assume the socket waitqueue is inactive and nobody will * try to jump onto it. */ void inet_csk_destroy_sock(struct sock *sk) { WARN_ON(sk->sk_state != TCP_CLOSE); WARN_ON(!sock_flag(sk, SOCK_DEAD)); /* It cannot be in hash table! */ WARN_ON(!sk_unhashed(sk)); /* If it has not 0 inet_sk(sk)->inet_num, it must be bound */ WARN_ON(inet_sk(sk)->inet_num && !inet_csk(sk)->icsk_bind_hash); sk->sk_prot->destroy(sk); sk_stream_kill_queues(sk); xfrm_sk_free_policy(sk); this_cpu_dec(*sk->sk_prot->orphan_count); sock_put(sk); } EXPORT_SYMBOL(inet_csk_destroy_sock); /* This function allows to force a closure of a socket after the call to * tcp/dccp_create_openreq_child(). */ void inet_csk_prepare_forced_close(struct sock *sk) __releases(&sk->sk_lock.slock) { /* sk_clone_lock locked the socket and set refcnt to 2 */ bh_unlock_sock(sk); sock_put(sk); inet_csk_prepare_for_destroy_sock(sk); inet_sk(sk)->inet_num = 0; } EXPORT_SYMBOL(inet_csk_prepare_forced_close); static int inet_ulp_can_listen(const struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ulp_ops && !icsk->icsk_ulp_ops->clone) return -EINVAL; return 0; } int inet_csk_listen_start(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet = inet_sk(sk); int err; err = inet_ulp_can_listen(sk); if (unlikely(err)) return err; reqsk_queue_alloc(&icsk->icsk_accept_queue); sk->sk_ack_backlog = 0; inet_csk_delack_init(sk); /* There is race window here: we announce ourselves listening, * but this transition is still not validated by get_port(). * It is OK, because this socket enters to hash table only * after validation is complete. */ inet_sk_state_store(sk, TCP_LISTEN); err = sk->sk_prot->get_port(sk, inet->inet_num); if (!err) { inet->inet_sport = htons(inet->inet_num); sk_dst_reset(sk); err = sk->sk_prot->hash(sk); if (likely(!err)) return 0; } inet_sk_set_state(sk, TCP_CLOSE); return err; } EXPORT_SYMBOL_GPL(inet_csk_listen_start); static void inet_child_forget(struct sock *sk, struct request_sock *req, struct sock *child) { sk->sk_prot->disconnect(child, O_NONBLOCK); sock_orphan(child); this_cpu_inc(*sk->sk_prot->orphan_count); if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) { BUG_ON(rcu_access_pointer(tcp_sk(child)->fastopen_rsk) != req); BUG_ON(sk != req->rsk_listener); /* Paranoid, to prevent race condition if * an inbound pkt destined for child is * blocked by sock lock in tcp_v4_rcv(). * Also to satisfy an assertion in * tcp_v4_destroy_sock(). */ RCU_INIT_POINTER(tcp_sk(child)->fastopen_rsk, NULL); } inet_csk_destroy_sock(child); } struct sock *inet_csk_reqsk_queue_add(struct sock *sk, struct request_sock *req, struct sock *child) { struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; spin_lock(&queue->rskq_lock); if (unlikely(sk->sk_state != TCP_LISTEN)) { inet_child_forget(sk, req, child); child = NULL; } else { req->sk = child; req->dl_next = NULL; if (queue->rskq_accept_head == NULL) WRITE_ONCE(queue->rskq_accept_head, req); else queue->rskq_accept_tail->dl_next = req; queue->rskq_accept_tail = req; sk_acceptq_added(sk); } spin_unlock(&queue->rskq_lock); return child; } EXPORT_SYMBOL(inet_csk_reqsk_queue_add); struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child, struct request_sock *req, bool own_req) { if (own_req) { inet_csk_reqsk_queue_drop(req->rsk_listener, req); reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req); if (sk != req->rsk_listener) { /* another listening sk has been selected, * migrate the req to it. */ struct request_sock *nreq; /* hold a refcnt for the nreq->rsk_listener * which is assigned in inet_reqsk_clone() */ sock_hold(sk); nreq = inet_reqsk_clone(req, sk); if (!nreq) { inet_child_forget(sk, req, child); goto child_put; } refcount_set(&nreq->rsk_refcnt, 1); if (inet_csk_reqsk_queue_add(sk, nreq, child)) { __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQSUCCESS); reqsk_migrate_reset(req); reqsk_put(req); return child; } __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); reqsk_migrate_reset(nreq); __reqsk_free(nreq); } else if (inet_csk_reqsk_queue_add(sk, req, child)) { return child; } } /* Too bad, another child took ownership of the request, undo. */ child_put: bh_unlock_sock(child); sock_put(child); return NULL; } EXPORT_SYMBOL(inet_csk_complete_hashdance); /* * This routine closes sockets which have been at least partially * opened, but not yet accepted. */ void inet_csk_listen_stop(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct request_sock_queue *queue = &icsk->icsk_accept_queue; struct request_sock *next, *req; /* Following specs, it would be better either to send FIN * (and enter FIN-WAIT-1, it is normal close) * or to send active reset (abort). * Certainly, it is pretty dangerous while synflood, but it is * bad justification for our negligence 8) * To be honest, we are not able to make either * of the variants now. --ANK */ while ((req = reqsk_queue_remove(queue, sk)) != NULL) { struct sock *child = req->sk, *nsk; struct request_sock *nreq; local_bh_disable(); bh_lock_sock(child); WARN_ON(sock_owned_by_user(child)); sock_hold(child); nsk = reuseport_migrate_sock(sk, child, NULL); if (nsk) { nreq = inet_reqsk_clone(req, nsk); if (nreq) { refcount_set(&nreq->rsk_refcnt, 1); if (inet_csk_reqsk_queue_add(nsk, nreq, child)) { __NET_INC_STATS(sock_net(nsk), LINUX_MIB_TCPMIGRATEREQSUCCESS); reqsk_migrate_reset(req); } else { __NET_INC_STATS(sock_net(nsk), LINUX_MIB_TCPMIGRATEREQFAILURE); reqsk_migrate_reset(nreq); __reqsk_free(nreq); } /* inet_csk_reqsk_queue_add() has already * called inet_child_forget() on failure case. */ goto skip_child_forget; } } inet_child_forget(sk, req, child); skip_child_forget: reqsk_put(req); bh_unlock_sock(child); local_bh_enable(); sock_put(child); cond_resched(); } if (queue->fastopenq.rskq_rst_head) { /* Free all the reqs queued in rskq_rst_head. */ spin_lock_bh(&queue->fastopenq.lock); req = queue->fastopenq.rskq_rst_head; queue->fastopenq.rskq_rst_head = NULL; spin_unlock_bh(&queue->fastopenq.lock); while (req != NULL) { next = req->dl_next; reqsk_put(req); req = next; } } WARN_ON_ONCE(sk->sk_ack_backlog); } EXPORT_SYMBOL_GPL(inet_csk_listen_stop); void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) { struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; const struct inet_sock *inet = inet_sk(sk); sin->sin_family = AF_INET; sin->sin_addr.s_addr = inet->inet_daddr; sin->sin_port = inet->inet_dport; } EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr); static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl) { const struct inet_sock *inet = inet_sk(sk); const struct ip_options_rcu *inet_opt; __be32 daddr = inet->inet_daddr; struct flowi4 *fl4; struct rtable *rt; rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt && inet_opt->opt.srr) daddr = inet_opt->opt.faddr; fl4 = &fl->u.ip4; rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr, inet->inet_dport, inet->inet_sport, sk->sk_protocol, ip_sock_rt_tos(sk), sk->sk_bound_dev_if); if (IS_ERR(rt)) rt = NULL; if (rt) sk_setup_caps(sk, &rt->dst); rcu_read_unlock(); return &rt->dst; } struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu) { struct dst_entry *dst = __sk_dst_check(sk, 0); struct inet_sock *inet = inet_sk(sk); if (!dst) { dst = inet_csk_rebuild_route(sk, &inet->cork.fl); if (!dst) goto out; } dst->ops->update_pmtu(dst, sk, NULL, mtu, true); dst = __sk_dst_check(sk, 0); if (!dst) dst = inet_csk_rebuild_route(sk, &inet->cork.fl); out: return dst; } EXPORT_SYMBOL_GPL(inet_csk_update_pmtu);
10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Filesystem parameter description and parser * * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #ifndef _LINUX_FS_PARSER_H #define _LINUX_FS_PARSER_H #include <linux/fs_context.h> struct path; struct constant_table { const char *name; int value; }; struct fs_parameter_spec; struct fs_parse_result; typedef int fs_param_type(struct p_log *, const struct fs_parameter_spec *, struct fs_parameter *, struct fs_parse_result *); /* * The type of parameter expected. */ fs_param_type fs_param_is_bool, fs_param_is_u32, fs_param_is_s32, fs_param_is_u64, fs_param_is_enum, fs_param_is_string, fs_param_is_blob, fs_param_is_blockdev, fs_param_is_path, fs_param_is_fd; /* * Specification of the type of value a parameter wants. * * Note that the fsparam_flag(), fsparam_string(), fsparam_u32(), ... macros * should be used to generate elements of this type. */ struct fs_parameter_spec { const char *name; fs_param_type *type; /* The desired parameter type */ u8 opt; /* Option number (returned by fs_parse()) */ unsigned short flags; #define fs_param_neg_with_no 0x0002 /* "noxxx" is negative param */ #define fs_param_can_be_empty 0x0004 /* "xxx=" is allowed */ #define fs_param_deprecated 0x0008 /* The param is deprecated */ const void *data; }; /* * Result of parse. */ struct fs_parse_result { bool negated; /* T if param was "noxxx" */ union { bool boolean; /* For spec_bool */ int int_32; /* For spec_s32/spec_enum */ unsigned int uint_32; /* For spec_u32{,_octal,_hex}/spec_enum */ u64 uint_64; /* For spec_u64 */ }; }; extern int __fs_parse(struct p_log *log, const struct fs_parameter_spec *desc, struct fs_parameter *value, struct fs_parse_result *result); static inline int fs_parse(struct fs_context *fc, const struct fs_parameter_spec *desc, struct fs_parameter *param, struct fs_parse_result *result) { return __fs_parse(&fc->log, desc, param, result); } extern int fs_lookup_param(struct fs_context *fc, struct fs_parameter *param, bool want_bdev, unsigned int flags, struct path *_path); extern int lookup_constant(const struct constant_table tbl[], const char *name, int not_found); #ifdef CONFIG_VALIDATE_FS_PARSER extern bool validate_constant_table(const struct constant_table *tbl, size_t tbl_size, int low, int high, int special); extern bool fs_validate_description(const char *name, const struct fs_parameter_spec *desc); #else static inline bool validate_constant_table(const struct constant_table *tbl, size_t tbl_size, int low, int high, int special) { return true; } static inline bool fs_validate_description(const char *name, const struct fs_parameter_spec *desc) { return true; } #endif /* * Parameter type, name, index and flags element constructors. Use as: * * fsparam_xxxx("foo", Opt_foo) * * If existing helpers are not enough, direct use of __fsparam() would * work, but any such case is probably a sign that new helper is needed. * Helpers will remain stable; low-level implementation may change. */ #define __fsparam(TYPE, NAME, OPT, FLAGS, DATA) \ { \ .name = NAME, \ .opt = OPT, \ .type = TYPE, \ .flags = FLAGS, \ .data = DATA \ } #define fsparam_flag(NAME, OPT) __fsparam(NULL, NAME, OPT, 0, NULL) #define fsparam_flag_no(NAME, OPT) \ __fsparam(NULL, NAME, OPT, fs_param_neg_with_no, NULL) #define fsparam_bool(NAME, OPT) __fsparam(fs_param_is_bool, NAME, OPT, 0, NULL) #define fsparam_u32(NAME, OPT) __fsparam(fs_param_is_u32, NAME, OPT, 0, NULL) #define fsparam_u32oct(NAME, OPT) \ __fsparam(fs_param_is_u32, NAME, OPT, 0, (void *)8) #define fsparam_u32hex(NAME, OPT) \ __fsparam(fs_param_is_u32_hex, NAME, OPT, 0, (void *)16) #define fsparam_s32(NAME, OPT) __fsparam(fs_param_is_s32, NAME, OPT, 0, NULL) #define fsparam_u64(NAME, OPT) __fsparam(fs_param_is_u64, NAME, OPT, 0, NULL) #define fsparam_enum(NAME, OPT, array) __fsparam(fs_param_is_enum, NAME, OPT, 0, array) #define fsparam_string(NAME, OPT) \ __fsparam(fs_param_is_string, NAME, OPT, 0, NULL) #define fsparam_blob(NAME, OPT) __fsparam(fs_param_is_blob, NAME, OPT, 0, NULL) #define fsparam_bdev(NAME, OPT) __fsparam(fs_param_is_blockdev, NAME, OPT, 0, NULL) #define fsparam_path(NAME, OPT) __fsparam(fs_param_is_path, NAME, OPT, 0, NULL) #define fsparam_fd(NAME, OPT) __fsparam(fs_param_is_fd, NAME, OPT, 0, NULL) /* String parameter that allows empty argument */ #define fsparam_string_empty(NAME, OPT) \ __fsparam(fs_param_is_string, NAME, OPT, fs_param_can_be_empty, NULL) #endif /* _LINUX_FS_PARSER_H */
1 1 1 1 1 2 2 2 1 1 4 3 3 2 2 2 2 2 2 2 2 2 1 4 4 4 4 2 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 // SPDX-License-Identifier: GPL-2.0 /* XDP user-space packet buffer * Copyright(c) 2018 Intel Corporation. */ #include <linux/init.h> #include <linux/sched/mm.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/uaccess.h> #include <linux/slab.h> #include <linux/bpf.h> #include <linux/mm.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/idr.h> #include <linux/vmalloc.h> #include "xdp_umem.h" #include "xsk_queue.h" static DEFINE_IDA(umem_ida); static void xdp_umem_unpin_pages(struct xdp_umem *umem) { unpin_user_pages_dirty_lock(umem->pgs, umem->npgs, true); kvfree(umem->pgs); umem->pgs = NULL; } static void xdp_umem_unaccount_pages(struct xdp_umem *umem) { if (umem->user) { atomic_long_sub(umem->npgs, &umem->user->locked_vm); free_uid(umem->user); } } static void xdp_umem_addr_unmap(struct xdp_umem *umem) { vunmap(umem->addrs); umem->addrs = NULL; } static int xdp_umem_addr_map(struct xdp_umem *umem, struct page **pages, u32 nr_pages) { umem->addrs = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); if (!umem->addrs) return -ENOMEM; return 0; } static void xdp_umem_release(struct xdp_umem *umem) { umem->zc = false; ida_free(&umem_ida, umem->id); xdp_umem_addr_unmap(umem); xdp_umem_unpin_pages(umem); xdp_umem_unaccount_pages(umem); kfree(umem); } static void xdp_umem_release_deferred(struct work_struct *work) { struct xdp_umem *umem = container_of(work, struct xdp_umem, work); xdp_umem_release(umem); } void xdp_get_umem(struct xdp_umem *umem) { refcount_inc(&umem->users); } void xdp_put_umem(struct xdp_umem *umem, bool defer_cleanup) { if (!umem) return; if (refcount_dec_and_test(&umem->users)) { if (defer_cleanup) { INIT_WORK(&umem->work, xdp_umem_release_deferred); schedule_work(&umem->work); } else { xdp_umem_release(umem); } } } static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address) { unsigned int gup_flags = FOLL_WRITE; long npgs; int err; umem->pgs = kvcalloc(umem->npgs, sizeof(*umem->pgs), GFP_KERNEL | __GFP_NOWARN); if (!umem->pgs) return -ENOMEM; mmap_read_lock(current->mm); npgs = pin_user_pages(address, umem->npgs, gup_flags | FOLL_LONGTERM, &umem->pgs[0]); mmap_read_unlock(current->mm); if (npgs != umem->npgs) { if (npgs >= 0) { umem->npgs = npgs; err = -ENOMEM; goto out_pin; } err = npgs; goto out_pgs; } return 0; out_pin: xdp_umem_unpin_pages(umem); out_pgs: kvfree(umem->pgs); umem->pgs = NULL; return err; } static int xdp_umem_account_pages(struct xdp_umem *umem) { unsigned long lock_limit, new_npgs, old_npgs; if (capable(CAP_IPC_LOCK)) return 0; lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; umem->user = get_uid(current_user()); do { old_npgs = atomic_long_read(&umem->user->locked_vm); new_npgs = old_npgs + umem->npgs; if (new_npgs > lock_limit) { free_uid(umem->user); umem->user = NULL; return -ENOBUFS; } } while (atomic_long_cmpxchg(&umem->user->locked_vm, old_npgs, new_npgs) != old_npgs); return 0; } #define XDP_UMEM_FLAGS_VALID ( \ XDP_UMEM_UNALIGNED_CHUNK_FLAG | \ XDP_UMEM_TX_SW_CSUM | \ 0) static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) { bool unaligned_chunks = mr->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG; u32 chunk_size = mr->chunk_size, headroom = mr->headroom; u64 addr = mr->addr, size = mr->len; u32 chunks_rem, npgs_rem; u64 chunks, npgs; int err; if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) { /* Strictly speaking we could support this, if: * - huge pages, or* * - using an IOMMU, or * - making sure the memory area is consecutive * but for now, we simply say "computer says no". */ return -EINVAL; } if (mr->flags & ~XDP_UMEM_FLAGS_VALID) return -EINVAL; if (!unaligned_chunks && !is_power_of_2(chunk_size)) return -EINVAL; if (!PAGE_ALIGNED(addr)) { /* Memory area has to be page size aligned. For * simplicity, this might change. */ return -EINVAL; } if ((addr + size) < addr) return -EINVAL; npgs = div_u64_rem(size, PAGE_SIZE, &npgs_rem); if (npgs_rem) npgs++; if (npgs > U32_MAX) return -EINVAL; chunks = div_u64_rem(size, chunk_size, &chunks_rem); if (!chunks || chunks > U32_MAX) return -EINVAL; if (!unaligned_chunks && chunks_rem) return -EINVAL; if (headroom >= chunk_size - XDP_PACKET_HEADROOM) return -EINVAL; if (mr->tx_metadata_len >= 256 || mr->tx_metadata_len % 8) return -EINVAL; umem->size = size; umem->headroom = headroom; umem->chunk_size = chunk_size; umem->chunks = chunks; umem->npgs = npgs; umem->pgs = NULL; umem->user = NULL; umem->flags = mr->flags; umem->tx_metadata_len = mr->tx_metadata_len; INIT_LIST_HEAD(&umem->xsk_dma_list); refcount_set(&umem->users, 1); err = xdp_umem_account_pages(umem); if (err) return err; err = xdp_umem_pin_pages(umem, (unsigned long)addr); if (err) goto out_account; err = xdp_umem_addr_map(umem, umem->pgs, umem->npgs); if (err) goto out_unpin; return 0; out_unpin: xdp_umem_unpin_pages(umem); out_account: xdp_umem_unaccount_pages(umem); return err; } struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr) { struct xdp_umem *umem; int err; umem = kzalloc(sizeof(*umem), GFP_KERNEL); if (!umem) return ERR_PTR(-ENOMEM); err = ida_alloc(&umem_ida, GFP_KERNEL); if (err < 0) { kfree(umem); return ERR_PTR(err); } umem->id = err; err = xdp_umem_reg(umem, mr); if (err) { ida_free(&umem_ida, umem->id); kfree(umem); return ERR_PTR(err); } return umem; }
127 128 127 128 127 126 128 28 28 26 28 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 // SPDX-License-Identifier: GPL-2.0 /* * fs/sysfs/dir.c - sysfs core and dir operation implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo <teheo@suse.de> * * Please see Documentation/filesystems/sysfs.rst for more information. */ #define pr_fmt(fmt) "sysfs: " fmt #include <linux/fs.h> #include <linux/kobject.h> #include <linux/slab.h> #include "sysfs.h" DEFINE_SPINLOCK(sysfs_symlink_target_lock); void sysfs_warn_dup(struct kernfs_node *parent, const char *name) { char *buf; buf = kzalloc(PATH_MAX, GFP_KERNEL); if (buf) kernfs_path(parent, buf, PATH_MAX); pr_warn("cannot create duplicate filename '%s/%s'\n", buf, name); dump_stack(); kfree(buf); } /** * sysfs_create_dir_ns - create a directory for an object with a namespace tag * @kobj: object we're creating directory for * @ns: the namespace tag to use */ int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) { struct kernfs_node *parent, *kn; kuid_t uid; kgid_t gid; if (WARN_ON(!kobj)) return -EINVAL; if (kobj->parent) parent = kobj->parent->sd; else parent = sysfs_root_kn; if (!parent) return -ENOENT; kobject_get_ownership(kobj, &uid, &gid); kn = kernfs_create_dir_ns(parent, kobject_name(kobj), 0755, uid, gid, kobj, ns); if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(parent, kobject_name(kobj)); return PTR_ERR(kn); } kobj->sd = kn; return 0; } /** * sysfs_remove_dir - remove an object's directory. * @kobj: object. * * The only thing special about this is that we remove any files in * the directory before we remove the directory, and we've inlined * what used to be sysfs_rmdir() below, instead of calling separately. */ void sysfs_remove_dir(struct kobject *kobj) { struct kernfs_node *kn = kobj->sd; /* * In general, kobject owner is responsible for ensuring removal * doesn't race with other operations and sysfs doesn't provide any * protection; however, when @kobj is used as a symlink target, the * symlinking entity usually doesn't own @kobj and thus has no * control over removal. @kobj->sd may be removed anytime * and symlink code may end up dereferencing an already freed node. * * sysfs_symlink_target_lock synchronizes @kobj->sd * disassociation against symlink operations so that symlink code * can safely dereference @kobj->sd. */ spin_lock(&sysfs_symlink_target_lock); kobj->sd = NULL; spin_unlock(&sysfs_symlink_target_lock); if (kn) { WARN_ON_ONCE(kernfs_type(kn) != KERNFS_DIR); kernfs_remove(kn); } } int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, const void *new_ns) { struct kernfs_node *parent; int ret; parent = kernfs_get_parent(kobj->sd); ret = kernfs_rename_ns(kobj->sd, parent, new_name, new_ns); kernfs_put(parent); return ret; } int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, const void *new_ns) { struct kernfs_node *kn = kobj->sd; struct kernfs_node *new_parent; new_parent = new_parent_kobj && new_parent_kobj->sd ? new_parent_kobj->sd : sysfs_root_kn; return kernfs_rename_ns(kn, new_parent, kn->name, new_ns); } /** * sysfs_create_mount_point - create an always empty directory * @parent_kobj: kobject that will contain this always empty directory * @name: The name of the always empty directory to add */ int sysfs_create_mount_point(struct kobject *parent_kobj, const char *name) { struct kernfs_node *kn, *parent = parent_kobj->sd; kn = kernfs_create_empty_dir(parent, name); if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(parent, name); return PTR_ERR(kn); } return 0; } EXPORT_SYMBOL_GPL(sysfs_create_mount_point); /** * sysfs_remove_mount_point - remove an always empty directory. * @parent_kobj: kobject that will contain this always empty directory * @name: The name of the always empty directory to remove * */ void sysfs_remove_mount_point(struct kobject *parent_kobj, const char *name) { struct kernfs_node *parent = parent_kobj->sd; kernfs_remove_by_name_ns(parent, name, NULL); } EXPORT_SYMBOL_GPL(sysfs_remove_mount_point);
33 32 33 33 32 32 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> #include "io_uring.h" #include "nop.h" struct io_nop { /* NOTE: kiocb has the file as the first member, so don't do it here */ struct file *file; int result; }; int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { unsigned int flags; struct io_nop *nop = io_kiocb_to_cmd(req, struct io_nop); flags = READ_ONCE(sqe->nop_flags); if (flags & ~IORING_NOP_INJECT_RESULT) return -EINVAL; if (flags & IORING_NOP_INJECT_RESULT) nop->result = READ_ONCE(sqe->len); else nop->result = 0; return 0; } int io_nop(struct io_kiocb *req, unsigned int issue_flags) { struct io_nop *nop = io_kiocb_to_cmd(req, struct io_nop); if (nop->result < 0) req_set_fail(req); io_req_set_res(req, nop->result, 0); return IOU_OK; }
30 29 9 30 9 9 14 13 14 30 16 26 30 29 13 30 30 30 30 2 30 29 28 28 29 28 29 29 29 10 10 9 10 10 10 21 21 20 27 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 10 10 13 14 13 14 14 14 14 14 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2016 Facebook * Copyright (C) 2013-2014 Jens Axboe */ #include <linux/sched.h> #include <linux/random.h> #include <linux/sbitmap.h> #include <linux/seq_file.h> static int init_alloc_hint(struct sbitmap *sb, gfp_t flags) { unsigned depth = sb->depth; sb->alloc_hint = alloc_percpu_gfp(unsigned int, flags); if (!sb->alloc_hint) return -ENOMEM; if (depth && !sb->round_robin) { int i; for_each_possible_cpu(i) *per_cpu_ptr(sb->alloc_hint, i) = get_random_u32_below(depth); } return 0; } static inline unsigned update_alloc_hint_before_get(struct sbitmap *sb, unsigned int depth) { unsigned hint; hint = this_cpu_read(*sb->alloc_hint); if (unlikely(hint >= depth)) { hint = depth ? get_random_u32_below(depth) : 0; this_cpu_write(*sb->alloc_hint, hint); } return hint; } static inline void update_alloc_hint_after_get(struct sbitmap *sb, unsigned int depth, unsigned int hint, unsigned int nr) { if (nr == -1) { /* If the map is full, a hint won't do us much good. */ this_cpu_write(*sb->alloc_hint, 0); } else if (nr == hint || unlikely(sb->round_robin)) { /* Only update the hint if we used it. */ hint = nr + 1; if (hint >= depth - 1) hint = 0; this_cpu_write(*sb->alloc_hint, hint); } } /* * See if we have deferred clears that we can batch move */ static inline bool sbitmap_deferred_clear(struct sbitmap_word *map) { unsigned long mask; if (!READ_ONCE(map->cleared)) return false; /* * First get a stable cleared mask, setting the old mask to 0. */ mask = xchg(&map->cleared, 0); /* * Now clear the masked bits in our free word */ atomic_long_andnot(mask, (atomic_long_t *)&map->word); BUILD_BUG_ON(sizeof(atomic_long_t) != sizeof(map->word)); return true; } int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift, gfp_t flags, int node, bool round_robin, bool alloc_hint) { unsigned int bits_per_word; if (shift < 0) shift = sbitmap_calculate_shift(depth); bits_per_word = 1U << shift; if (bits_per_word > BITS_PER_LONG) return -EINVAL; sb->shift = shift; sb->depth = depth; sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word); sb->round_robin = round_robin; if (depth == 0) { sb->map = NULL; return 0; } if (alloc_hint) { if (init_alloc_hint(sb, flags)) return -ENOMEM; } else { sb->alloc_hint = NULL; } sb->map = kvzalloc_node(sb->map_nr * sizeof(*sb->map), flags, node); if (!sb->map) { free_percpu(sb->alloc_hint); return -ENOMEM; } return 0; } EXPORT_SYMBOL_GPL(sbitmap_init_node); void sbitmap_resize(struct sbitmap *sb, unsigned int depth) { unsigned int bits_per_word = 1U << sb->shift; unsigned int i; for (i = 0; i < sb->map_nr; i++) sbitmap_deferred_clear(&sb->map[i]); sb->depth = depth; sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word); } EXPORT_SYMBOL_GPL(sbitmap_resize); static int __sbitmap_get_word(unsigned long *word, unsigned long depth, unsigned int hint, bool wrap) { int nr; /* don't wrap if starting from 0 */ wrap = wrap && hint; while (1) { nr = find_next_zero_bit(word, depth, hint); if (unlikely(nr >= depth)) { /* * We started with an offset, and we didn't reset the * offset to 0 in a failure case, so start from 0 to * exhaust the map. */ if (hint && wrap) { hint = 0; continue; } return -1; } if (!test_and_set_bit_lock(nr, word)) break; hint = nr + 1; if (hint >= depth - 1) hint = 0; } return nr; } static int sbitmap_find_bit_in_word(struct sbitmap_word *map, unsigned int depth, unsigned int alloc_hint, bool wrap) { int nr; do { nr = __sbitmap_get_word(&map->word, depth, alloc_hint, wrap); if (nr != -1) break; if (!sbitmap_deferred_clear(map)) break; } while (1); return nr; } static int sbitmap_find_bit(struct sbitmap *sb, unsigned int depth, unsigned int index, unsigned int alloc_hint, bool wrap) { unsigned int i; int nr = -1; for (i = 0; i < sb->map_nr; i++) { nr = sbitmap_find_bit_in_word(&sb->map[index], min_t(unsigned int, __map_depth(sb, index), depth), alloc_hint, wrap); if (nr != -1) { nr += index << sb->shift; break; } /* Jump to next index. */ alloc_hint = 0; if (++index >= sb->map_nr) index = 0; } return nr; } static int __sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint) { unsigned int index; index = SB_NR_TO_INDEX(sb, alloc_hint); /* * Unless we're doing round robin tag allocation, just use the * alloc_hint to find the right word index. No point in looping * twice in find_next_zero_bit() for that case. */ if (sb->round_robin) alloc_hint = SB_NR_TO_BIT(sb, alloc_hint); else alloc_hint = 0; return sbitmap_find_bit(sb, UINT_MAX, index, alloc_hint, !sb->round_robin); } int sbitmap_get(struct sbitmap *sb) { int nr; unsigned int hint, depth; if (WARN_ON_ONCE(unlikely(!sb->alloc_hint))) return -1; depth = READ_ONCE(sb->depth); hint = update_alloc_hint_before_get(sb, depth); nr = __sbitmap_get(sb, hint); update_alloc_hint_after_get(sb, depth, hint, nr); return nr; } EXPORT_SYMBOL_GPL(sbitmap_get); static int __sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint, unsigned long shallow_depth) { unsigned int index; index = SB_NR_TO_INDEX(sb, alloc_hint); alloc_hint = SB_NR_TO_BIT(sb, alloc_hint); return sbitmap_find_bit(sb, shallow_depth, index, alloc_hint, true); } int sbitmap_get_shallow(struct sbitmap *sb, unsigned long shallow_depth) { int nr; unsigned int hint, depth; if (WARN_ON_ONCE(unlikely(!sb->alloc_hint))) return -1; depth = READ_ONCE(sb->depth); hint = update_alloc_hint_before_get(sb, depth); nr = __sbitmap_get_shallow(sb, hint, shallow_depth); update_alloc_hint_after_get(sb, depth, hint, nr); return nr; } EXPORT_SYMBOL_GPL(sbitmap_get_shallow); bool sbitmap_any_bit_set(const struct sbitmap *sb) { unsigned int i; for (i = 0; i < sb->map_nr; i++) { if (sb->map[i].word & ~sb->map[i].cleared) return true; } return false; } EXPORT_SYMBOL_GPL(sbitmap_any_bit_set); static unsigned int __sbitmap_weight(const struct sbitmap *sb, bool set) { unsigned int i, weight = 0; for (i = 0; i < sb->map_nr; i++) { const struct sbitmap_word *word = &sb->map[i]; unsigned int word_depth = __map_depth(sb, i); if (set) weight += bitmap_weight(&word->word, word_depth); else weight += bitmap_weight(&word->cleared, word_depth); } return weight; } static unsigned int sbitmap_cleared(const struct sbitmap *sb) { return __sbitmap_weight(sb, false); } unsigned int sbitmap_weight(const struct sbitmap *sb) { return __sbitmap_weight(sb, true) - sbitmap_cleared(sb); } EXPORT_SYMBOL_GPL(sbitmap_weight); void sbitmap_show(struct sbitmap *sb, struct seq_file *m) { seq_printf(m, "depth=%u\n", sb->depth); seq_printf(m, "busy=%u\n", sbitmap_weight(sb)); seq_printf(m, "cleared=%u\n", sbitmap_cleared(sb)); seq_printf(m, "bits_per_word=%u\n", 1U << sb->shift); seq_printf(m, "map_nr=%u\n", sb->map_nr); } EXPORT_SYMBOL_GPL(sbitmap_show); static inline void emit_byte(struct seq_file *m, unsigned int offset, u8 byte) { if ((offset & 0xf) == 0) { if (offset != 0) seq_putc(m, '\n'); seq_printf(m, "%08x:", offset); } if ((offset & 0x1) == 0) seq_putc(m, ' '); seq_printf(m, "%02x", byte); } void sbitmap_bitmap_show(struct sbitmap *sb, struct seq_file *m) { u8 byte = 0; unsigned int byte_bits = 0; unsigned int offset = 0; int i; for (i = 0; i < sb->map_nr; i++) { unsigned long word = READ_ONCE(sb->map[i].word); unsigned long cleared = READ_ONCE(sb->map[i].cleared); unsigned int word_bits = __map_depth(sb, i); word &= ~cleared; while (word_bits > 0) { unsigned int bits = min(8 - byte_bits, word_bits); byte |= (word & (BIT(bits) - 1)) << byte_bits; byte_bits += bits; if (byte_bits == 8) { emit_byte(m, offset, byte); byte = 0; byte_bits = 0; offset++; } word >>= bits; word_bits -= bits; } } if (byte_bits) { emit_byte(m, offset, byte); offset++; } if (offset) seq_putc(m, '\n'); } EXPORT_SYMBOL_GPL(sbitmap_bitmap_show); static unsigned int sbq_calc_wake_batch(struct sbitmap_queue *sbq, unsigned int depth) { unsigned int wake_batch; unsigned int shallow_depth; /* * Each full word of the bitmap has bits_per_word bits, and there might * be a partial word. There are depth / bits_per_word full words and * depth % bits_per_word bits left over. In bitwise arithmetic: * * bits_per_word = 1 << shift * depth / bits_per_word = depth >> shift * depth % bits_per_word = depth & ((1 << shift) - 1) * * Each word can be limited to sbq->min_shallow_depth bits. */ shallow_depth = min(1U << sbq->sb.shift, sbq->min_shallow_depth); depth = ((depth >> sbq->sb.shift) * shallow_depth + min(depth & ((1U << sbq->sb.shift) - 1), shallow_depth)); wake_batch = clamp_t(unsigned int, depth / SBQ_WAIT_QUEUES, 1, SBQ_WAKE_BATCH); return wake_batch; } int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, int shift, bool round_robin, gfp_t flags, int node) { int ret; int i; ret = sbitmap_init_node(&sbq->sb, depth, shift, flags, node, round_robin, true); if (ret) return ret; sbq->min_shallow_depth = UINT_MAX; sbq->wake_batch = sbq_calc_wake_batch(sbq, depth); atomic_set(&sbq->wake_index, 0); atomic_set(&sbq->ws_active, 0); atomic_set(&sbq->completion_cnt, 0); atomic_set(&sbq->wakeup_cnt, 0); sbq->ws = kzalloc_node(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags, node); if (!sbq->ws) { sbitmap_free(&sbq->sb); return -ENOMEM; } for (i = 0; i < SBQ_WAIT_QUEUES; i++) init_waitqueue_head(&sbq->ws[i].wait); return 0; } EXPORT_SYMBOL_GPL(sbitmap_queue_init_node); static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq, unsigned int depth) { unsigned int wake_batch; wake_batch = sbq_calc_wake_batch(sbq, depth); if (sbq->wake_batch != wake_batch) WRITE_ONCE(sbq->wake_batch, wake_batch); } void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq, unsigned int users) { unsigned int wake_batch; unsigned int depth = (sbq->sb.depth + users - 1) / users; wake_batch = clamp_val(depth / SBQ_WAIT_QUEUES, 1, SBQ_WAKE_BATCH); WRITE_ONCE(sbq->wake_batch, wake_batch); } EXPORT_SYMBOL_GPL(sbitmap_queue_recalculate_wake_batch); void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth) { sbitmap_queue_update_wake_batch(sbq, depth); sbitmap_resize(&sbq->sb, depth); } EXPORT_SYMBOL_GPL(sbitmap_queue_resize); int __sbitmap_queue_get(struct sbitmap_queue *sbq) { return sbitmap_get(&sbq->sb); } EXPORT_SYMBOL_GPL(__sbitmap_queue_get); unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags, unsigned int *offset) { struct sbitmap *sb = &sbq->sb; unsigned int hint, depth; unsigned long index, nr; int i; if (unlikely(sb->round_robin)) return 0; depth = READ_ONCE(sb->depth); hint = update_alloc_hint_before_get(sb, depth); index = SB_NR_TO_INDEX(sb, hint); for (i = 0; i < sb->map_nr; i++) { struct sbitmap_word *map = &sb->map[index]; unsigned long get_mask; unsigned int map_depth = __map_depth(sb, index); unsigned long val; sbitmap_deferred_clear(map); val = READ_ONCE(map->word); if (val == (1UL << (map_depth - 1)) - 1) goto next; nr = find_first_zero_bit(&val, map_depth); if (nr + nr_tags <= map_depth) { atomic_long_t *ptr = (atomic_long_t *) &map->word; get_mask = ((1UL << nr_tags) - 1) << nr; while (!atomic_long_try_cmpxchg(ptr, &val, get_mask | val)) ; get_mask = (get_mask & ~val) >> nr; if (get_mask) { *offset = nr + (index << sb->shift); update_alloc_hint_after_get(sb, depth, hint, *offset + nr_tags - 1); return get_mask; } } next: /* Jump to next index. */ if (++index >= sb->map_nr) index = 0; } return 0; } int sbitmap_queue_get_shallow(struct sbitmap_queue *sbq, unsigned int shallow_depth) { WARN_ON_ONCE(shallow_depth < sbq->min_shallow_depth); return sbitmap_get_shallow(&sbq->sb, shallow_depth); } EXPORT_SYMBOL_GPL(sbitmap_queue_get_shallow); void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq, unsigned int min_shallow_depth) { sbq->min_shallow_depth = min_shallow_depth; sbitmap_queue_update_wake_batch(sbq, sbq->sb.depth); } EXPORT_SYMBOL_GPL(sbitmap_queue_min_shallow_depth); static void __sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr) { int i, wake_index, woken; if (!atomic_read(&sbq->ws_active)) return; wake_index = atomic_read(&sbq->wake_index); for (i = 0; i < SBQ_WAIT_QUEUES; i++) { struct sbq_wait_state *ws = &sbq->ws[wake_index]; /* * Advance the index before checking the current queue. * It improves fairness, by ensuring the queue doesn't * need to be fully emptied before trying to wake up * from the next one. */ wake_index = sbq_index_inc(wake_index); if (waitqueue_active(&ws->wait)) { woken = wake_up_nr(&ws->wait, nr); if (woken == nr) break; nr -= woken; } } if (wake_index != atomic_read(&sbq->wake_index)) atomic_set(&sbq->wake_index, wake_index); } void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr) { unsigned int wake_batch = READ_ONCE(sbq->wake_batch); unsigned int wakeups; if (!atomic_read(&sbq->ws_active)) return; atomic_add(nr, &sbq->completion_cnt); wakeups = atomic_read(&sbq->wakeup_cnt); do { if (atomic_read(&sbq->completion_cnt) - wakeups < wake_batch) return; } while (!atomic_try_cmpxchg(&sbq->wakeup_cnt, &wakeups, wakeups + wake_batch)); __sbitmap_queue_wake_up(sbq, wake_batch); } EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up); static inline void sbitmap_update_cpu_hint(struct sbitmap *sb, int cpu, int tag) { if (likely(!sb->round_robin && tag < sb->depth)) data_race(*per_cpu_ptr(sb->alloc_hint, cpu) = tag); } void sbitmap_queue_clear_batch(struct sbitmap_queue *sbq, int offset, int *tags, int nr_tags) { struct sbitmap *sb = &sbq->sb; unsigned long *addr = NULL; unsigned long mask = 0; int i; smp_mb__before_atomic(); for (i = 0; i < nr_tags; i++) { const int tag = tags[i] - offset; unsigned long *this_addr; /* since we're clearing a batch, skip the deferred map */ this_addr = &sb->map[SB_NR_TO_INDEX(sb, tag)].word; if (!addr) { addr = this_addr; } else if (addr != this_addr) { atomic_long_andnot(mask, (atomic_long_t *) addr); mask = 0; addr = this_addr; } mask |= (1UL << SB_NR_TO_BIT(sb, tag)); } if (mask) atomic_long_andnot(mask, (atomic_long_t *) addr); smp_mb__after_atomic(); sbitmap_queue_wake_up(sbq, nr_tags); sbitmap_update_cpu_hint(&sbq->sb, raw_smp_processor_id(), tags[nr_tags - 1] - offset); } void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr, unsigned int cpu) { /* * Once the clear bit is set, the bit may be allocated out. * * Orders READ/WRITE on the associated instance(such as request * of blk_mq) by this bit for avoiding race with re-allocation, * and its pair is the memory barrier implied in __sbitmap_get_word. * * One invariant is that the clear bit has to be zero when the bit * is in use. */ smp_mb__before_atomic(); sbitmap_deferred_clear_bit(&sbq->sb, nr); /* * Pairs with the memory barrier in set_current_state() to ensure the * proper ordering of clear_bit_unlock()/waitqueue_active() in the waker * and test_and_set_bit_lock()/prepare_to_wait()/finish_wait() in the * waiter. See the comment on waitqueue_active(). */ smp_mb__after_atomic(); sbitmap_queue_wake_up(sbq, 1); sbitmap_update_cpu_hint(&sbq->sb, cpu, nr); } EXPORT_SYMBOL_GPL(sbitmap_queue_clear); void sbitmap_queue_wake_all(struct sbitmap_queue *sbq) { int i, wake_index; /* * Pairs with the memory barrier in set_current_state() like in * sbitmap_queue_wake_up(). */ smp_mb(); wake_index = atomic_read(&sbq->wake_index); for (i = 0; i < SBQ_WAIT_QUEUES; i++) { struct sbq_wait_state *ws = &sbq->ws[wake_index]; if (waitqueue_active(&ws->wait)) wake_up(&ws->wait); wake_index = sbq_index_inc(wake_index); } } EXPORT_SYMBOL_GPL(sbitmap_queue_wake_all); void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m) { bool first; int i; sbitmap_show(&sbq->sb, m); seq_puts(m, "alloc_hint={"); first = true; for_each_possible_cpu(i) { if (!first) seq_puts(m, ", "); first = false; seq_printf(m, "%u", *per_cpu_ptr(sbq->sb.alloc_hint, i)); } seq_puts(m, "}\n"); seq_printf(m, "wake_batch=%u\n", sbq->wake_batch); seq_printf(m, "wake_index=%d\n", atomic_read(&sbq->wake_index)); seq_printf(m, "ws_active=%d\n", atomic_read(&sbq->ws_active)); seq_puts(m, "ws={\n"); for (i = 0; i < SBQ_WAIT_QUEUES; i++) { struct sbq_wait_state *ws = &sbq->ws[i]; seq_printf(m, "\t{.wait=%s},\n", waitqueue_active(&ws->wait) ? "active" : "inactive"); } seq_puts(m, "}\n"); seq_printf(m, "round_robin=%d\n", sbq->sb.round_robin); seq_printf(m, "min_shallow_depth=%u\n", sbq->min_shallow_depth); } EXPORT_SYMBOL_GPL(sbitmap_queue_show); void sbitmap_add_wait_queue(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, struct sbq_wait *sbq_wait) { if (!sbq_wait->sbq) { sbq_wait->sbq = sbq; atomic_inc(&sbq->ws_active); add_wait_queue(&ws->wait, &sbq_wait->wait); } } EXPORT_SYMBOL_GPL(sbitmap_add_wait_queue); void sbitmap_del_wait_queue(struct sbq_wait *sbq_wait) { list_del_init(&sbq_wait->wait.entry); if (sbq_wait->sbq) { atomic_dec(&sbq_wait->sbq->ws_active); sbq_wait->sbq = NULL; } } EXPORT_SYMBOL_GPL(sbitmap_del_wait_queue); void sbitmap_prepare_to_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, struct sbq_wait *sbq_wait, int state) { if (!sbq_wait->sbq) { atomic_inc(&sbq->ws_active); sbq_wait->sbq = sbq; } prepare_to_wait_exclusive(&ws->wait, &sbq_wait->wait, state); } EXPORT_SYMBOL_GPL(sbitmap_prepare_to_wait); void sbitmap_finish_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, struct sbq_wait *sbq_wait) { finish_wait(&ws->wait, &sbq_wait->wait); if (sbq_wait->sbq) { atomic_dec(&sbq->ws_active); sbq_wait->sbq = NULL; } } EXPORT_SYMBOL_GPL(sbitmap_finish_wait);
1 13 15 15 15 15 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 // SPDX-License-Identifier: GPL-2.0-or-later /* * Key-agreement Protocol Primitives (KPP) * * Copyright (c) 2016, Intel Corporation * Authors: Salvatore Benedetto <salvatore.benedetto@intel.com> */ #include <crypto/internal/kpp.h> #include <linux/cryptouser.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/seq_file.h> #include <linux/string.h> #include <net/netlink.h> #include "internal.h" static int __maybe_unused crypto_kpp_report( struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_report_kpp rkpp; memset(&rkpp, 0, sizeof(rkpp)); strscpy(rkpp.type, "kpp", sizeof(rkpp.type)); return nla_put(skb, CRYPTOCFGA_REPORT_KPP, sizeof(rkpp), &rkpp); } static void crypto_kpp_show(struct seq_file *m, struct crypto_alg *alg) __maybe_unused; static void crypto_kpp_show(struct seq_file *m, struct crypto_alg *alg) { seq_puts(m, "type : kpp\n"); } static void crypto_kpp_exit_tfm(struct crypto_tfm *tfm) { struct crypto_kpp *kpp = __crypto_kpp_tfm(tfm); struct kpp_alg *alg = crypto_kpp_alg(kpp); alg->exit(kpp); } static int crypto_kpp_init_tfm(struct crypto_tfm *tfm) { struct crypto_kpp *kpp = __crypto_kpp_tfm(tfm); struct kpp_alg *alg = crypto_kpp_alg(kpp); if (alg->exit) kpp->base.exit = crypto_kpp_exit_tfm; if (alg->init) return alg->init(kpp); return 0; } static void crypto_kpp_free_instance(struct crypto_instance *inst) { struct kpp_instance *kpp = kpp_instance(inst); kpp->free(kpp); } static const struct crypto_type crypto_kpp_type = { .extsize = crypto_alg_extsize, .init_tfm = crypto_kpp_init_tfm, .free = crypto_kpp_free_instance, #ifdef CONFIG_PROC_FS .show = crypto_kpp_show, #endif #if IS_ENABLED(CONFIG_CRYPTO_USER) .report = crypto_kpp_report, #endif .maskclear = ~CRYPTO_ALG_TYPE_MASK, .maskset = CRYPTO_ALG_TYPE_MASK, .type = CRYPTO_ALG_TYPE_KPP, .tfmsize = offsetof(struct crypto_kpp, base), }; struct crypto_kpp *crypto_alloc_kpp(const char *alg_name, u32 type, u32 mask) { return crypto_alloc_tfm(alg_name, &crypto_kpp_type, type, mask); } EXPORT_SYMBOL_GPL(crypto_alloc_kpp); int crypto_grab_kpp(struct crypto_kpp_spawn *spawn, struct crypto_instance *inst, const char *name, u32 type, u32 mask) { spawn->base.frontend = &crypto_kpp_type; return crypto_grab_spawn(&spawn->base, inst, name, type, mask); } EXPORT_SYMBOL_GPL(crypto_grab_kpp); int crypto_has_kpp(const char *alg_name, u32 type, u32 mask) { return crypto_type_has_alg(alg_name, &crypto_kpp_type, type, mask); } EXPORT_SYMBOL_GPL(crypto_has_kpp); static void kpp_prepare_alg(struct kpp_alg *alg) { struct crypto_alg *base = &alg->base; base->cra_type = &crypto_kpp_type; base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK; base->cra_flags |= CRYPTO_ALG_TYPE_KPP; } int crypto_register_kpp(struct kpp_alg *alg) { struct crypto_alg *base = &alg->base; kpp_prepare_alg(alg); return crypto_register_alg(base); } EXPORT_SYMBOL_GPL(crypto_register_kpp); void crypto_unregister_kpp(struct kpp_alg *alg) { crypto_unregister_alg(&alg->base); } EXPORT_SYMBOL_GPL(crypto_unregister_kpp); int kpp_register_instance(struct crypto_template *tmpl, struct kpp_instance *inst) { if (WARN_ON(!inst->free)) return -EINVAL; kpp_prepare_alg(&inst->alg); return crypto_register_instance(tmpl, kpp_crypto_instance(inst)); } EXPORT_SYMBOL_GPL(kpp_register_instance); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Key-agreement Protocol Primitives");
2 2 2 2 2 2 2 1 1 1 1 6 2 2 2 2 2 2 1 1 1 1 2 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 7 6 6 6 5 1 1 1 2 1 1 3 7 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 // SPDX-License-Identifier: GPL-2.0-or-later /* * User-space I/O driver support for HID subsystem * Copyright (c) 2012 David Herrmann */ /* */ #include <linux/atomic.h> #include <linux/compat.h> #include <linux/cred.h> #include <linux/device.h> #include <linux/fs.h> #include <linux/hid.h> #include <linux/input.h> #include <linux/miscdevice.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/poll.h> #include <linux/sched.h> #include <linux/spinlock.h> #include <linux/uhid.h> #include <linux/wait.h> #define UHID_NAME "uhid" #define UHID_BUFSIZE 32 struct uhid_device { struct mutex devlock; /* This flag tracks whether the HID device is usable for commands from * userspace. The flag is already set before hid_add_device(), which * runs in workqueue context, to allow hid_add_device() to communicate * with userspace. * However, if hid_add_device() fails, the flag is cleared without * holding devlock. * We guarantee that if @running changes from true to false while you're * holding @devlock, it's still fine to access @hid. */ bool running; __u8 *rd_data; uint rd_size; /* When this is NULL, userspace may use UHID_CREATE/UHID_CREATE2. */ struct hid_device *hid; struct uhid_event input_buf; wait_queue_head_t waitq; spinlock_t qlock; __u8 head; __u8 tail; struct uhid_event *outq[UHID_BUFSIZE]; /* blocking GET_REPORT support; state changes protected by qlock */ struct mutex report_lock; wait_queue_head_t report_wait; bool report_running; u32 report_id; u32 report_type; struct uhid_event report_buf; struct work_struct worker; }; static struct miscdevice uhid_misc; static void uhid_device_add_worker(struct work_struct *work) { struct uhid_device *uhid = container_of(work, struct uhid_device, worker); int ret; ret = hid_add_device(uhid->hid); if (ret) { hid_err(uhid->hid, "Cannot register HID device: error %d\n", ret); /* We used to call hid_destroy_device() here, but that's really * messy to get right because we have to coordinate with * concurrent writes from userspace that might be in the middle * of using uhid->hid. * Just leave uhid->hid as-is for now, and clean it up when * userspace tries to close or reinitialize the uhid instance. * * However, we do have to clear the ->running flag and do a * wakeup to make sure userspace knows that the device is gone. */ WRITE_ONCE(uhid->running, false); wake_up_interruptible(&uhid->report_wait); } } static void uhid_queue(struct uhid_device *uhid, struct uhid_event *ev) { __u8 newhead; newhead = (uhid->head + 1) % UHID_BUFSIZE; if (newhead != uhid->tail) { uhid->outq[uhid->head] = ev; uhid->head = newhead; wake_up_interruptible(&uhid->waitq); } else { hid_warn(uhid->hid, "Output queue is full\n"); kfree(ev); } } static int uhid_queue_event(struct uhid_device *uhid, __u32 event) { unsigned long flags; struct uhid_event *ev; ev = kzalloc(sizeof(*ev), GFP_KERNEL); if (!ev) return -ENOMEM; ev->type = event; spin_lock_irqsave(&uhid->qlock, flags); uhid_queue(uhid, ev); spin_unlock_irqrestore(&uhid->qlock, flags); return 0; } static int uhid_hid_start(struct hid_device *hid) { struct uhid_device *uhid = hid->driver_data; struct uhid_event *ev; unsigned long flags; ev = kzalloc(sizeof(*ev), GFP_KERNEL); if (!ev) return -ENOMEM; ev->type = UHID_START; if (hid->report_enum[HID_FEATURE_REPORT].numbered) ev->u.start.dev_flags |= UHID_DEV_NUMBERED_FEATURE_REPORTS; if (hid->report_enum[HID_OUTPUT_REPORT].numbered) ev->u.start.dev_flags |= UHID_DEV_NUMBERED_OUTPUT_REPORTS; if (hid->report_enum[HID_INPUT_REPORT].numbered) ev->u.start.dev_flags |= UHID_DEV_NUMBERED_INPUT_REPORTS; spin_lock_irqsave(&uhid->qlock, flags); uhid_queue(uhid, ev); spin_unlock_irqrestore(&uhid->qlock, flags); return 0; } static void uhid_hid_stop(struct hid_device *hid) { struct uhid_device *uhid = hid->driver_data; hid->claimed = 0; uhid_queue_event(uhid, UHID_STOP); } static int uhid_hid_open(struct hid_device *hid) { struct uhid_device *uhid = hid->driver_data; return uhid_queue_event(uhid, UHID_OPEN); } static void uhid_hid_close(struct hid_device *hid) { struct uhid_device *uhid = hid->driver_data; uhid_queue_event(uhid, UHID_CLOSE); } static int uhid_hid_parse(struct hid_device *hid) { struct uhid_device *uhid = hid->driver_data; return hid_parse_report(hid, uhid->rd_data, uhid->rd_size); } /* must be called with report_lock held */ static int __uhid_report_queue_and_wait(struct uhid_device *uhid, struct uhid_event *ev, __u32 *report_id) { unsigned long flags; int ret; spin_lock_irqsave(&uhid->qlock, flags); *report_id = ++uhid->report_id; uhid->report_type = ev->type + 1; uhid->report_running = true; uhid_queue(uhid, ev); spin_unlock_irqrestore(&uhid->qlock, flags); ret = wait_event_interruptible_timeout(uhid->report_wait, !uhid->report_running || !READ_ONCE(uhid->running), 5 * HZ); if (!ret || !READ_ONCE(uhid->running) || uhid->report_running) ret = -EIO; else if (ret < 0) ret = -ERESTARTSYS; else ret = 0; uhid->report_running = false; return ret; } static void uhid_report_wake_up(struct uhid_device *uhid, u32 id, const struct uhid_event *ev) { unsigned long flags; spin_lock_irqsave(&uhid->qlock, flags); /* id for old report; drop it silently */ if (uhid->report_type != ev->type || uhid->report_id != id) goto unlock; if (!uhid->report_running) goto unlock; memcpy(&uhid->report_buf, ev, sizeof(*ev)); uhid->report_running = false; wake_up_interruptible(&uhid->report_wait); unlock: spin_unlock_irqrestore(&uhid->qlock, flags); } static int uhid_hid_get_report(struct hid_device *hid, unsigned char rnum, u8 *buf, size_t count, u8 rtype) { struct uhid_device *uhid = hid->driver_data; struct uhid_get_report_reply_req *req; struct uhid_event *ev; int ret; if (!READ_ONCE(uhid->running)) return -EIO; ev = kzalloc(sizeof(*ev), GFP_KERNEL); if (!ev) return -ENOMEM; ev->type = UHID_GET_REPORT; ev->u.get_report.rnum = rnum; ev->u.get_report.rtype = rtype; ret = mutex_lock_interruptible(&uhid->report_lock); if (ret) { kfree(ev); return ret; } /* this _always_ takes ownership of @ev */ ret = __uhid_report_queue_and_wait(uhid, ev, &ev->u.get_report.id); if (ret) goto unlock; req = &uhid->report_buf.u.get_report_reply; if (req->err) { ret = -EIO; } else { ret = min3(count, (size_t)req->size, (size_t)UHID_DATA_MAX); memcpy(buf, req->data, ret); } unlock: mutex_unlock(&uhid->report_lock); return ret; } static int uhid_hid_set_report(struct hid_device *hid, unsigned char rnum, const u8 *buf, size_t count, u8 rtype) { struct uhid_device *uhid = hid->driver_data; struct uhid_event *ev; int ret; if (!READ_ONCE(uhid->running) || count > UHID_DATA_MAX) return -EIO; ev = kzalloc(sizeof(*ev), GFP_KERNEL); if (!ev) return -ENOMEM; ev->type = UHID_SET_REPORT; ev->u.set_report.rnum = rnum; ev->u.set_report.rtype = rtype; ev->u.set_report.size = count; memcpy(ev->u.set_report.data, buf, count); ret = mutex_lock_interruptible(&uhid->report_lock); if (ret) { kfree(ev); return ret; } /* this _always_ takes ownership of @ev */ ret = __uhid_report_queue_and_wait(uhid, ev, &ev->u.set_report.id); if (ret) goto unlock; if (uhid->report_buf.u.set_report_reply.err) ret = -EIO; else ret = count; unlock: mutex_unlock(&uhid->report_lock); return ret; } static int uhid_hid_raw_request(struct hid_device *hid, unsigned char reportnum, __u8 *buf, size_t len, unsigned char rtype, int reqtype) { u8 u_rtype; switch (rtype) { case HID_FEATURE_REPORT: u_rtype = UHID_FEATURE_REPORT; break; case HID_OUTPUT_REPORT: u_rtype = UHID_OUTPUT_REPORT; break; case HID_INPUT_REPORT: u_rtype = UHID_INPUT_REPORT; break; default: return -EINVAL; } switch (reqtype) { case HID_REQ_GET_REPORT: return uhid_hid_get_report(hid, reportnum, buf, len, u_rtype); case HID_REQ_SET_REPORT: return uhid_hid_set_report(hid, reportnum, buf, len, u_rtype); default: return -EIO; } } static int uhid_hid_output_raw(struct hid_device *hid, __u8 *buf, size_t count, unsigned char report_type) { struct uhid_device *uhid = hid->driver_data; __u8 rtype; unsigned long flags; struct uhid_event *ev; switch (report_type) { case HID_FEATURE_REPORT: rtype = UHID_FEATURE_REPORT; break; case HID_OUTPUT_REPORT: rtype = UHID_OUTPUT_REPORT; break; default: return -EINVAL; } if (count < 1 || count > UHID_DATA_MAX) return -EINVAL; ev = kzalloc(sizeof(*ev), GFP_KERNEL); if (!ev) return -ENOMEM; ev->type = UHID_OUTPUT; ev->u.output.size = count; ev->u.output.rtype = rtype; memcpy(ev->u.output.data, buf, count); spin_lock_irqsave(&uhid->qlock, flags); uhid_queue(uhid, ev); spin_unlock_irqrestore(&uhid->qlock, flags); return count; } static int uhid_hid_output_report(struct hid_device *hid, __u8 *buf, size_t count) { return uhid_hid_output_raw(hid, buf, count, HID_OUTPUT_REPORT); } static const struct hid_ll_driver uhid_hid_driver = { .start = uhid_hid_start, .stop = uhid_hid_stop, .open = uhid_hid_open, .close = uhid_hid_close, .parse = uhid_hid_parse, .raw_request = uhid_hid_raw_request, .output_report = uhid_hid_output_report, .max_buffer_size = UHID_DATA_MAX, }; #ifdef CONFIG_COMPAT /* Apparently we haven't stepped on these rakes enough times yet. */ struct uhid_create_req_compat { __u8 name[128]; __u8 phys[64]; __u8 uniq[64]; compat_uptr_t rd_data; __u16 rd_size; __u16 bus; __u32 vendor; __u32 product; __u32 version; __u32 country; } __attribute__((__packed__)); static int uhid_event_from_user(const char __user *buffer, size_t len, struct uhid_event *event) { if (in_compat_syscall()) { u32 type; if (get_user(type, buffer)) return -EFAULT; if (type == UHID_CREATE) { /* * This is our messed up request with compat pointer. * It is largish (more than 256 bytes) so we better * allocate it from the heap. */ struct uhid_create_req_compat *compat; compat = kzalloc(sizeof(*compat), GFP_KERNEL); if (!compat) return -ENOMEM; buffer += sizeof(type); len -= sizeof(type); if (copy_from_user(compat, buffer, min(len, sizeof(*compat)))) { kfree(compat); return -EFAULT; } /* Shuffle the data over to proper structure */ event->type = type; memcpy(event->u.create.name, compat->name, sizeof(compat->name)); memcpy(event->u.create.phys, compat->phys, sizeof(compat->phys)); memcpy(event->u.create.uniq, compat->uniq, sizeof(compat->uniq)); event->u.create.rd_data = compat_ptr(compat->rd_data); event->u.create.rd_size = compat->rd_size; event->u.create.bus = compat->bus; event->u.create.vendor = compat->vendor; event->u.create.product = compat->product; event->u.create.version = compat->version; event->u.create.country = compat->country; kfree(compat); return 0; } /* All others can be copied directly */ } if (copy_from_user(event, buffer, min(len, sizeof(*event)))) return -EFAULT; return 0; } #else static int uhid_event_from_user(const char __user *buffer, size_t len, struct uhid_event *event) { if (copy_from_user(event, buffer, min(len, sizeof(*event)))) return -EFAULT; return 0; } #endif static int uhid_dev_create2(struct uhid_device *uhid, const struct uhid_event *ev) { struct hid_device *hid; size_t rd_size; void *rd_data; int ret; if (uhid->hid) return -EALREADY; rd_size = ev->u.create2.rd_size; if (rd_size <= 0 || rd_size > HID_MAX_DESCRIPTOR_SIZE) return -EINVAL; rd_data = kmemdup(ev->u.create2.rd_data, rd_size, GFP_KERNEL); if (!rd_data) return -ENOMEM; uhid->rd_size = rd_size; uhid->rd_data = rd_data; hid = hid_allocate_device(); if (IS_ERR(hid)) { ret = PTR_ERR(hid); goto err_free; } BUILD_BUG_ON(sizeof(hid->name) != sizeof(ev->u.create2.name)); strscpy(hid->name, ev->u.create2.name, sizeof(hid->name)); BUILD_BUG_ON(sizeof(hid->phys) != sizeof(ev->u.create2.phys)); strscpy(hid->phys, ev->u.create2.phys, sizeof(hid->phys)); BUILD_BUG_ON(sizeof(hid->uniq) != sizeof(ev->u.create2.uniq)); strscpy(hid->uniq, ev->u.create2.uniq, sizeof(hid->uniq)); hid->ll_driver = &uhid_hid_driver; hid->bus = ev->u.create2.bus; hid->vendor = ev->u.create2.vendor; hid->product = ev->u.create2.product; hid->version = ev->u.create2.version; hid->country = ev->u.create2.country; hid->driver_data = uhid; hid->dev.parent = uhid_misc.this_device; uhid->hid = hid; uhid->running = true; /* Adding of a HID device is done through a worker, to allow HID drivers * which use feature requests during .probe to work, without they would * be blocked on devlock, which is held by uhid_char_write. */ schedule_work(&uhid->worker); return 0; err_free: kfree(uhid->rd_data); uhid->rd_data = NULL; uhid->rd_size = 0; return ret; } static int uhid_dev_create(struct uhid_device *uhid, struct uhid_event *ev) { struct uhid_create_req orig; orig = ev->u.create; if (orig.rd_size <= 0 || orig.rd_size > HID_MAX_DESCRIPTOR_SIZE) return -EINVAL; if (copy_from_user(&ev->u.create2.rd_data, orig.rd_data, orig.rd_size)) return -EFAULT; memcpy(ev->u.create2.name, orig.name, sizeof(orig.name)); memcpy(ev->u.create2.phys, orig.phys, sizeof(orig.phys)); memcpy(ev->u.create2.uniq, orig.uniq, sizeof(orig.uniq)); ev->u.create2.rd_size = orig.rd_size; ev->u.create2.bus = orig.bus; ev->u.create2.vendor = orig.vendor; ev->u.create2.product = orig.product; ev->u.create2.version = orig.version; ev->u.create2.country = orig.country; return uhid_dev_create2(uhid, ev); } static int uhid_dev_destroy(struct uhid_device *uhid) { if (!uhid->hid) return -EINVAL; WRITE_ONCE(uhid->running, false); wake_up_interruptible(&uhid->report_wait); cancel_work_sync(&uhid->worker); hid_destroy_device(uhid->hid); uhid->hid = NULL; kfree(uhid->rd_data); return 0; } static int uhid_dev_input(struct uhid_device *uhid, struct uhid_event *ev) { if (!READ_ONCE(uhid->running)) return -EINVAL; hid_input_report(uhid->hid, HID_INPUT_REPORT, ev->u.input.data, min_t(size_t, ev->u.input.size, UHID_DATA_MAX), 0); return 0; } static int uhid_dev_input2(struct uhid_device *uhid, struct uhid_event *ev) { if (!READ_ONCE(uhid->running)) return -EINVAL; hid_input_report(uhid->hid, HID_INPUT_REPORT, ev->u.input2.data, min_t(size_t, ev->u.input2.size, UHID_DATA_MAX), 0); return 0; } static int uhid_dev_get_report_reply(struct uhid_device *uhid, struct uhid_event *ev) { if (!READ_ONCE(uhid->running)) return -EINVAL; uhid_report_wake_up(uhid, ev->u.get_report_reply.id, ev); return 0; } static int uhid_dev_set_report_reply(struct uhid_device *uhid, struct uhid_event *ev) { if (!READ_ONCE(uhid->running)) return -EINVAL; uhid_report_wake_up(uhid, ev->u.set_report_reply.id, ev); return 0; } static int uhid_char_open(struct inode *inode, struct file *file) { struct uhid_device *uhid; uhid = kzalloc(sizeof(*uhid), GFP_KERNEL); if (!uhid) return -ENOMEM; mutex_init(&uhid->devlock); mutex_init(&uhid->report_lock); spin_lock_init(&uhid->qlock); init_waitqueue_head(&uhid->waitq); init_waitqueue_head(&uhid->report_wait); uhid->running = false; INIT_WORK(&uhid->worker, uhid_device_add_worker); file->private_data = uhid; stream_open(inode, file); return 0; } static int uhid_char_release(struct inode *inode, struct file *file) { struct uhid_device *uhid = file->private_data; unsigned int i; uhid_dev_destroy(uhid); for (i = 0; i < UHID_BUFSIZE; ++i) kfree(uhid->outq[i]); kfree(uhid); return 0; } static ssize_t uhid_char_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos) { struct uhid_device *uhid = file->private_data; int ret; unsigned long flags; size_t len; /* they need at least the "type" member of uhid_event */ if (count < sizeof(__u32)) return -EINVAL; try_again: if (file->f_flags & O_NONBLOCK) { if (uhid->head == uhid->tail) return -EAGAIN; } else { ret = wait_event_interruptible(uhid->waitq, uhid->head != uhid->tail); if (ret) return ret; } ret = mutex_lock_interruptible(&uhid->devlock); if (ret) return ret; if (uhid->head == uhid->tail) { mutex_unlock(&uhid->devlock); goto try_again; } else { len = min(count, sizeof(**uhid->outq)); if (copy_to_user(buffer, uhid->outq[uhid->tail], len)) { ret = -EFAULT; } else { kfree(uhid->outq[uhid->tail]); uhid->outq[uhid->tail] = NULL; spin_lock_irqsave(&uhid->qlock, flags); uhid->tail = (uhid->tail + 1) % UHID_BUFSIZE; spin_unlock_irqrestore(&uhid->qlock, flags); } } mutex_unlock(&uhid->devlock); return ret ? ret : len; } static ssize_t uhid_char_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { struct uhid_device *uhid = file->private_data; int ret; size_t len; /* we need at least the "type" member of uhid_event */ if (count < sizeof(__u32)) return -EINVAL; ret = mutex_lock_interruptible(&uhid->devlock); if (ret) return ret; memset(&uhid->input_buf, 0, sizeof(uhid->input_buf)); len = min(count, sizeof(uhid->input_buf)); ret = uhid_event_from_user(buffer, len, &uhid->input_buf); if (ret) goto unlock; switch (uhid->input_buf.type) { case UHID_CREATE: /* * 'struct uhid_create_req' contains a __user pointer which is * copied from, so it's unsafe to allow this with elevated * privileges (e.g. from a setuid binary) or via kernel_write(). */ if (file->f_cred != current_cred()) { pr_err_once("UHID_CREATE from different security context by process %d (%s), this is not allowed.\n", task_tgid_vnr(current), current->comm); ret = -EACCES; goto unlock; } ret = uhid_dev_create(uhid, &uhid->input_buf); break; case UHID_CREATE2: ret = uhid_dev_create2(uhid, &uhid->input_buf); break; case UHID_DESTROY: ret = uhid_dev_destroy(uhid); break; case UHID_INPUT: ret = uhid_dev_input(uhid, &uhid->input_buf); break; case UHID_INPUT2: ret = uhid_dev_input2(uhid, &uhid->input_buf); break; case UHID_GET_REPORT_REPLY: ret = uhid_dev_get_report_reply(uhid, &uhid->input_buf); break; case UHID_SET_REPORT_REPLY: ret = uhid_dev_set_report_reply(uhid, &uhid->input_buf); break; default: ret = -EOPNOTSUPP; } unlock: mutex_unlock(&uhid->devlock); /* return "count" not "len" to not confuse the caller */ return ret ? ret : count; } static __poll_t uhid_char_poll(struct file *file, poll_table *wait) { struct uhid_device *uhid = file->private_data; __poll_t mask = EPOLLOUT | EPOLLWRNORM; /* uhid is always writable */ poll_wait(file, &uhid->waitq, wait); if (uhid->head != uhid->tail) mask |= EPOLLIN | EPOLLRDNORM; return mask; } static const struct file_operations uhid_fops = { .owner = THIS_MODULE, .open = uhid_char_open, .release = uhid_char_release, .read = uhid_char_read, .write = uhid_char_write, .poll = uhid_char_poll, .llseek = no_llseek, }; static struct miscdevice uhid_misc = { .fops = &uhid_fops, .minor = UHID_MINOR, .name = UHID_NAME, }; module_misc_device(uhid_misc); MODULE_LICENSE("GPL"); MODULE_AUTHOR("David Herrmann <dh.herrmann@gmail.com>"); MODULE_DESCRIPTION("User-space I/O driver support for HID subsystem"); MODULE_ALIAS_MISCDEV(UHID_MINOR); MODULE_ALIAS("devname:" UHID_NAME);
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 58 57 57 58 58 12 12 12 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * This file implements the various access functions for the * PROC file system. This is very similar to the IPv4 version, * except it reports the sockets in the INET6 address family. * * Authors: David S. Miller (davem@caip.rutgers.edu) * YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org> */ #include <linux/socket.h> #include <linux/net.h> #include <linux/ipv6.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/stddef.h> #include <linux/export.h> #include <net/net_namespace.h> #include <net/ip.h> #include <net/sock.h> #include <net/tcp.h> #include <net/udp.h> #include <net/transp_v6.h> #include <net/ipv6.h> #define MAX4(a, b, c, d) \ max_t(u32, max_t(u32, a, b), max_t(u32, c, d)) #define SNMP_MIB_MAX MAX4(UDP_MIB_MAX, TCP_MIB_MAX, \ IPSTATS_MIB_MAX, ICMP_MIB_MAX) static int sockstat6_seq_show(struct seq_file *seq, void *v) { struct net *net = seq->private; seq_printf(seq, "TCP6: inuse %d\n", sock_prot_inuse_get(net, &tcpv6_prot)); seq_printf(seq, "UDP6: inuse %d\n", sock_prot_inuse_get(net, &udpv6_prot)); seq_printf(seq, "UDPLITE6: inuse %d\n", sock_prot_inuse_get(net, &udplitev6_prot)); seq_printf(seq, "RAW6: inuse %d\n", sock_prot_inuse_get(net, &rawv6_prot)); seq_printf(seq, "FRAG6: inuse %u memory %lu\n", atomic_read(&net->ipv6.fqdir->rhashtable.nelems), frag_mem_limit(net->ipv6.fqdir)); return 0; } static const struct snmp_mib snmp6_ipstats_list[] = { /* ipv6 mib according to RFC 2465 */ SNMP_MIB_ITEM("Ip6InReceives", IPSTATS_MIB_INPKTS), SNMP_MIB_ITEM("Ip6InHdrErrors", IPSTATS_MIB_INHDRERRORS), SNMP_MIB_ITEM("Ip6InTooBigErrors", IPSTATS_MIB_INTOOBIGERRORS), SNMP_MIB_ITEM("Ip6InNoRoutes", IPSTATS_MIB_INNOROUTES), SNMP_MIB_ITEM("Ip6InAddrErrors", IPSTATS_MIB_INADDRERRORS), SNMP_MIB_ITEM("Ip6InUnknownProtos", IPSTATS_MIB_INUNKNOWNPROTOS), SNMP_MIB_ITEM("Ip6InTruncatedPkts", IPSTATS_MIB_INTRUNCATEDPKTS), SNMP_MIB_ITEM("Ip6InDiscards", IPSTATS_MIB_INDISCARDS), SNMP_MIB_ITEM("Ip6InDelivers", IPSTATS_MIB_INDELIVERS), SNMP_MIB_ITEM("Ip6OutForwDatagrams", IPSTATS_MIB_OUTFORWDATAGRAMS), SNMP_MIB_ITEM("Ip6OutRequests", IPSTATS_MIB_OUTREQUESTS), SNMP_MIB_ITEM("Ip6OutDiscards", IPSTATS_MIB_OUTDISCARDS), SNMP_MIB_ITEM("Ip6OutNoRoutes", IPSTATS_MIB_OUTNOROUTES), SNMP_MIB_ITEM("Ip6ReasmTimeout", IPSTATS_MIB_REASMTIMEOUT), SNMP_MIB_ITEM("Ip6ReasmReqds", IPSTATS_MIB_REASMREQDS), SNMP_MIB_ITEM("Ip6ReasmOKs", IPSTATS_MIB_REASMOKS), SNMP_MIB_ITEM("Ip6ReasmFails", IPSTATS_MIB_REASMFAILS), SNMP_MIB_ITEM("Ip6FragOKs", IPSTATS_MIB_FRAGOKS), SNMP_MIB_ITEM("Ip6FragFails", IPSTATS_MIB_FRAGFAILS), SNMP_MIB_ITEM("Ip6FragCreates", IPSTATS_MIB_FRAGCREATES), SNMP_MIB_ITEM("Ip6InMcastPkts", IPSTATS_MIB_INMCASTPKTS), SNMP_MIB_ITEM("Ip6OutMcastPkts", IPSTATS_MIB_OUTMCASTPKTS), SNMP_MIB_ITEM("Ip6InOctets", IPSTATS_MIB_INOCTETS), SNMP_MIB_ITEM("Ip6OutOctets", IPSTATS_MIB_OUTOCTETS), SNMP_MIB_ITEM("Ip6InMcastOctets", IPSTATS_MIB_INMCASTOCTETS), SNMP_MIB_ITEM("Ip6OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS), SNMP_MIB_ITEM("Ip6InBcastOctets", IPSTATS_MIB_INBCASTOCTETS), SNMP_MIB_ITEM("Ip6OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS), /* IPSTATS_MIB_CSUMERRORS is not relevant in IPv6 (no checksum) */ SNMP_MIB_ITEM("Ip6InNoECTPkts", IPSTATS_MIB_NOECTPKTS), SNMP_MIB_ITEM("Ip6InECT1Pkts", IPSTATS_MIB_ECT1PKTS), SNMP_MIB_ITEM("Ip6InECT0Pkts", IPSTATS_MIB_ECT0PKTS), SNMP_MIB_ITEM("Ip6InCEPkts", IPSTATS_MIB_CEPKTS), SNMP_MIB_ITEM("Ip6OutTransmits", IPSTATS_MIB_OUTPKTS), SNMP_MIB_SENTINEL }; static const struct snmp_mib snmp6_icmp6_list[] = { /* icmpv6 mib according to RFC 2466 */ SNMP_MIB_ITEM("Icmp6InMsgs", ICMP6_MIB_INMSGS), SNMP_MIB_ITEM("Icmp6InErrors", ICMP6_MIB_INERRORS), SNMP_MIB_ITEM("Icmp6OutMsgs", ICMP6_MIB_OUTMSGS), SNMP_MIB_ITEM("Icmp6OutErrors", ICMP6_MIB_OUTERRORS), SNMP_MIB_ITEM("Icmp6InCsumErrors", ICMP6_MIB_CSUMERRORS), SNMP_MIB_ITEM("Icmp6OutRateLimitHost", ICMP6_MIB_RATELIMITHOST), SNMP_MIB_SENTINEL }; /* RFC 4293 v6 ICMPMsgStatsTable; named items for RFC 2466 compatibility */ static const char *const icmp6type2name[256] = { [ICMPV6_DEST_UNREACH] = "DestUnreachs", [ICMPV6_PKT_TOOBIG] = "PktTooBigs", [ICMPV6_TIME_EXCEED] = "TimeExcds", [ICMPV6_PARAMPROB] = "ParmProblems", [ICMPV6_ECHO_REQUEST] = "Echos", [ICMPV6_ECHO_REPLY] = "EchoReplies", [ICMPV6_MGM_QUERY] = "GroupMembQueries", [ICMPV6_MGM_REPORT] = "GroupMembResponses", [ICMPV6_MGM_REDUCTION] = "GroupMembReductions", [ICMPV6_MLD2_REPORT] = "MLDv2Reports", [NDISC_ROUTER_ADVERTISEMENT] = "RouterAdvertisements", [NDISC_ROUTER_SOLICITATION] = "RouterSolicits", [NDISC_NEIGHBOUR_ADVERTISEMENT] = "NeighborAdvertisements", [NDISC_NEIGHBOUR_SOLICITATION] = "NeighborSolicits", [NDISC_REDIRECT] = "Redirects", }; static const struct snmp_mib snmp6_udp6_list[] = { SNMP_MIB_ITEM("Udp6InDatagrams", UDP_MIB_INDATAGRAMS), SNMP_MIB_ITEM("Udp6NoPorts", UDP_MIB_NOPORTS), SNMP_MIB_ITEM("Udp6InErrors", UDP_MIB_INERRORS), SNMP_MIB_ITEM("Udp6OutDatagrams", UDP_MIB_OUTDATAGRAMS), SNMP_MIB_ITEM("Udp6RcvbufErrors", UDP_MIB_RCVBUFERRORS), SNMP_MIB_ITEM("Udp6SndbufErrors", UDP_MIB_SNDBUFERRORS), SNMP_MIB_ITEM("Udp6InCsumErrors", UDP_MIB_CSUMERRORS), SNMP_MIB_ITEM("Udp6IgnoredMulti", UDP_MIB_IGNOREDMULTI), SNMP_MIB_ITEM("Udp6MemErrors", UDP_MIB_MEMERRORS), SNMP_MIB_SENTINEL }; static const struct snmp_mib snmp6_udplite6_list[] = { SNMP_MIB_ITEM("UdpLite6InDatagrams", UDP_MIB_INDATAGRAMS), SNMP_MIB_ITEM("UdpLite6NoPorts", UDP_MIB_NOPORTS), SNMP_MIB_ITEM("UdpLite6InErrors", UDP_MIB_INERRORS), SNMP_MIB_ITEM("UdpLite6OutDatagrams", UDP_MIB_OUTDATAGRAMS), SNMP_MIB_ITEM("UdpLite6RcvbufErrors", UDP_MIB_RCVBUFERRORS), SNMP_MIB_ITEM("UdpLite6SndbufErrors", UDP_MIB_SNDBUFERRORS), SNMP_MIB_ITEM("UdpLite6InCsumErrors", UDP_MIB_CSUMERRORS), SNMP_MIB_ITEM("UdpLite6MemErrors", UDP_MIB_MEMERRORS), SNMP_MIB_SENTINEL }; static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, atomic_long_t *smib) { char name[32]; int i; /* print by name -- deprecated items */ for (i = 0; i < ICMP6MSG_MIB_MAX; i++) { int icmptype; const char *p; icmptype = i & 0xff; p = icmp6type2name[icmptype]; if (!p) /* don't print un-named types here */ continue; snprintf(name, sizeof(name), "Icmp6%s%s", i & 0x100 ? "Out" : "In", p); seq_printf(seq, "%-32s\t%lu\n", name, atomic_long_read(smib + i)); } /* print by number (nonzero only) - ICMPMsgStat format */ for (i = 0; i < ICMP6MSG_MIB_MAX; i++) { unsigned long val; val = atomic_long_read(smib + i); if (!val) continue; snprintf(name, sizeof(name), "Icmp6%sType%u", i & 0x100 ? "Out" : "In", i & 0xff); seq_printf(seq, "%-32s\t%lu\n", name, val); } } /* can be called either with percpu mib (pcpumib != NULL), * or shared one (smib != NULL) */ static void snmp6_seq_show_item(struct seq_file *seq, void __percpu *pcpumib, atomic_long_t *smib, const struct snmp_mib *itemlist) { unsigned long buff[SNMP_MIB_MAX]; int i; if (pcpumib) { memset(buff, 0, sizeof(unsigned long) * SNMP_MIB_MAX); snmp_get_cpu_field_batch(buff, itemlist, pcpumib); for (i = 0; itemlist[i].name; i++) seq_printf(seq, "%-32s\t%lu\n", itemlist[i].name, buff[i]); } else { for (i = 0; itemlist[i].name; i++) seq_printf(seq, "%-32s\t%lu\n", itemlist[i].name, atomic_long_read(smib + itemlist[i].entry)); } } static void snmp6_seq_show_item64(struct seq_file *seq, void __percpu *mib, const struct snmp_mib *itemlist, size_t syncpoff) { u64 buff64[SNMP_MIB_MAX]; int i; memset(buff64, 0, sizeof(u64) * SNMP_MIB_MAX); snmp_get_cpu_field64_batch(buff64, itemlist, mib, syncpoff); for (i = 0; itemlist[i].name; i++) seq_printf(seq, "%-32s\t%llu\n", itemlist[i].name, buff64[i]); } static int snmp6_seq_show(struct seq_file *seq, void *v) { struct net *net = (struct net *)seq->private; snmp6_seq_show_item64(seq, net->mib.ipv6_statistics, snmp6_ipstats_list, offsetof(struct ipstats_mib, syncp)); snmp6_seq_show_item(seq, net->mib.icmpv6_statistics, NULL, snmp6_icmp6_list); snmp6_seq_show_icmpv6msg(seq, net->mib.icmpv6msg_statistics->mibs); snmp6_seq_show_item(seq, net->mib.udp_stats_in6, NULL, snmp6_udp6_list); snmp6_seq_show_item(seq, net->mib.udplite_stats_in6, NULL, snmp6_udplite6_list); return 0; } static int snmp6_dev_seq_show(struct seq_file *seq, void *v) { struct inet6_dev *idev = (struct inet6_dev *)seq->private; seq_printf(seq, "%-32s\t%u\n", "ifIndex", idev->dev->ifindex); snmp6_seq_show_item64(seq, idev->stats.ipv6, snmp6_ipstats_list, offsetof(struct ipstats_mib, syncp)); snmp6_seq_show_item(seq, NULL, idev->stats.icmpv6dev->mibs, snmp6_icmp6_list); snmp6_seq_show_icmpv6msg(seq, idev->stats.icmpv6msgdev->mibs); return 0; } int snmp6_register_dev(struct inet6_dev *idev) { struct proc_dir_entry *p; struct net *net; if (!idev || !idev->dev) return -EINVAL; net = dev_net(idev->dev); if (!net->mib.proc_net_devsnmp6) return -ENOENT; p = proc_create_single_data(idev->dev->name, 0444, net->mib.proc_net_devsnmp6, snmp6_dev_seq_show, idev); if (!p) return -ENOMEM; idev->stats.proc_dir_entry = p; return 0; } int snmp6_unregister_dev(struct inet6_dev *idev) { struct net *net = dev_net(idev->dev); if (!net->mib.proc_net_devsnmp6) return -ENOENT; if (!idev->stats.proc_dir_entry) return -EINVAL; proc_remove(idev->stats.proc_dir_entry); idev->stats.proc_dir_entry = NULL; return 0; } static int __net_init ipv6_proc_init_net(struct net *net) { if (!proc_create_net_single("sockstat6", 0444, net->proc_net, sockstat6_seq_show, NULL)) return -ENOMEM; if (!proc_create_net_single("snmp6", 0444, net->proc_net, snmp6_seq_show, NULL)) goto proc_snmp6_fail; net->mib.proc_net_devsnmp6 = proc_mkdir("dev_snmp6", net->proc_net); if (!net->mib.proc_net_devsnmp6) goto proc_dev_snmp6_fail; return 0; proc_dev_snmp6_fail: remove_proc_entry("snmp6", net->proc_net); proc_snmp6_fail: remove_proc_entry("sockstat6", net->proc_net); return -ENOMEM; } static void __net_exit ipv6_proc_exit_net(struct net *net) { remove_proc_entry("sockstat6", net->proc_net); remove_proc_entry("dev_snmp6", net->proc_net); remove_proc_entry("snmp6", net->proc_net); } static struct pernet_operations ipv6_proc_ops = { .init = ipv6_proc_init_net, .exit = ipv6_proc_exit_net, }; int __init ipv6_misc_proc_init(void) { return register_pernet_subsys(&ipv6_proc_ops); } void ipv6_misc_proc_exit(void) { unregister_pernet_subsys(&ipv6_proc_ops); }
6 1 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 2 2 2 2 2 1 2 1 1 2 2 2 2 2 1 1 2 2 2 6 6 6 6 6 2 2 2 2 2 2 2 2 2 2 2 2 6 6 6 6 6 6 6 6 6 6 6 6 6 6 1 1 2 1 1 2 2 7 6 6 6 6 6 2 6 6 6 6 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 // SPDX-License-Identifier: GPL-2.0-or-later /* * fs/inotify_user.c - inotify support for userspace * * Authors: * John McCutchan <ttb@tentacle.dhs.org> * Robert Love <rml@novell.com> * * Copyright (C) 2005 John McCutchan * Copyright 2006 Hewlett-Packard Development Company, L.P. * * Copyright (C) 2009 Eric Paris <Red Hat Inc> * inotify was largely rewriten to make use of the fsnotify infrastructure */ #include <linux/file.h> #include <linux/fs.h> /* struct inode */ #include <linux/fsnotify_backend.h> #include <linux/idr.h> #include <linux/init.h> /* fs_initcall */ #include <linux/inotify.h> #include <linux/kernel.h> /* roundup() */ #include <linux/namei.h> /* LOOKUP_FOLLOW */ #include <linux/sched/signal.h> #include <linux/slab.h> /* struct kmem_cache */ #include <linux/syscalls.h> #include <linux/types.h> #include <linux/anon_inodes.h> #include <linux/uaccess.h> #include <linux/poll.h> #include <linux/wait.h> #include <linux/memcontrol.h> #include <linux/security.h> #include "inotify.h" #include "../fdinfo.h" #include <asm/ioctls.h> /* * An inotify watch requires allocating an inotify_inode_mark structure as * well as pinning the watched inode. Doubling the size of a VFS inode * should be more than enough to cover the additional filesystem inode * size increase. */ #define INOTIFY_WATCH_COST (sizeof(struct inotify_inode_mark) + \ 2 * sizeof(struct inode)) /* configurable via /proc/sys/fs/inotify/ */ static int inotify_max_queued_events __read_mostly; struct kmem_cache *inotify_inode_mark_cachep __ro_after_init; #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> static long it_zero = 0; static long it_int_max = INT_MAX; static struct ctl_table inotify_table[] = { { .procname = "max_user_instances", .data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES], .maxlen = sizeof(long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, .extra1 = &it_zero, .extra2 = &it_int_max, }, { .procname = "max_user_watches", .data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES], .maxlen = sizeof(long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, .extra1 = &it_zero, .extra2 = &it_int_max, }, { .procname = "max_queued_events", .data = &inotify_max_queued_events, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO }, }; static void __init inotify_sysctls_init(void) { register_sysctl("fs/inotify", inotify_table); } #else #define inotify_sysctls_init() do { } while (0) #endif /* CONFIG_SYSCTL */ static inline __u32 inotify_arg_to_mask(struct inode *inode, u32 arg) { __u32 mask; /* * Everything should receive events when the inode is unmounted. * All directories care about children. */ mask = (FS_UNMOUNT); if (S_ISDIR(inode->i_mode)) mask |= FS_EVENT_ON_CHILD; /* mask off the flags used to open the fd */ mask |= (arg & INOTIFY_USER_MASK); return mask; } #define INOTIFY_MARK_FLAGS \ (FSNOTIFY_MARK_FLAG_EXCL_UNLINK | FSNOTIFY_MARK_FLAG_IN_ONESHOT) static inline unsigned int inotify_arg_to_flags(u32 arg) { unsigned int flags = 0; if (arg & IN_EXCL_UNLINK) flags |= FSNOTIFY_MARK_FLAG_EXCL_UNLINK; if (arg & IN_ONESHOT) flags |= FSNOTIFY_MARK_FLAG_IN_ONESHOT; return flags; } static inline u32 inotify_mask_to_arg(__u32 mask) { return mask & (IN_ALL_EVENTS | IN_ISDIR | IN_UNMOUNT | IN_IGNORED | IN_Q_OVERFLOW); } /* inotify userspace file descriptor functions */ static __poll_t inotify_poll(struct file *file, poll_table *wait) { struct fsnotify_group *group = file->private_data; __poll_t ret = 0; poll_wait(file, &group->notification_waitq, wait); spin_lock(&group->notification_lock); if (!fsnotify_notify_queue_is_empty(group)) ret = EPOLLIN | EPOLLRDNORM; spin_unlock(&group->notification_lock); return ret; } static int round_event_name_len(struct fsnotify_event *fsn_event) { struct inotify_event_info *event; event = INOTIFY_E(fsn_event); if (!event->name_len) return 0; return roundup(event->name_len + 1, sizeof(struct inotify_event)); } /* * Get an inotify_kernel_event if one exists and is small * enough to fit in "count". Return an error pointer if * not large enough. * * Called with the group->notification_lock held. */ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, size_t count) { size_t event_size = sizeof(struct inotify_event); struct fsnotify_event *event; event = fsnotify_peek_first_event(group); if (!event) return NULL; pr_debug("%s: group=%p event=%p\n", __func__, group, event); event_size += round_event_name_len(event); if (event_size > count) return ERR_PTR(-EINVAL); /* held the notification_lock the whole time, so this is the * same event we peeked above */ fsnotify_remove_first_event(group); return event; } /* * Copy an event to user space, returning how much we copied. * * We already checked that the event size is smaller than the * buffer we had in "get_one_event()" above. */ static ssize_t copy_event_to_user(struct fsnotify_group *group, struct fsnotify_event *fsn_event, char __user *buf) { struct inotify_event inotify_event; struct inotify_event_info *event; size_t event_size = sizeof(struct inotify_event); size_t name_len; size_t pad_name_len; pr_debug("%s: group=%p event=%p\n", __func__, group, fsn_event); event = INOTIFY_E(fsn_event); name_len = event->name_len; /* * round up name length so it is a multiple of event_size * plus an extra byte for the terminating '\0'. */ pad_name_len = round_event_name_len(fsn_event); inotify_event.len = pad_name_len; inotify_event.mask = inotify_mask_to_arg(event->mask); inotify_event.wd = event->wd; inotify_event.cookie = event->sync_cookie; /* send the main event */ if (copy_to_user(buf, &inotify_event, event_size)) return -EFAULT; buf += event_size; /* * fsnotify only stores the pathname, so here we have to send the pathname * and then pad that pathname out to a multiple of sizeof(inotify_event) * with zeros. */ if (pad_name_len) { /* copy the path name */ if (copy_to_user(buf, event->name, name_len)) return -EFAULT; buf += name_len; /* fill userspace with 0's */ if (clear_user(buf, pad_name_len - name_len)) return -EFAULT; event_size += pad_name_len; } return event_size; } static ssize_t inotify_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { struct fsnotify_group *group; struct fsnotify_event *kevent; char __user *start; int ret; DEFINE_WAIT_FUNC(wait, woken_wake_function); start = buf; group = file->private_data; add_wait_queue(&group->notification_waitq, &wait); while (1) { spin_lock(&group->notification_lock); kevent = get_one_event(group, count); spin_unlock(&group->notification_lock); pr_debug("%s: group=%p kevent=%p\n", __func__, group, kevent); if (kevent) { ret = PTR_ERR(kevent); if (IS_ERR(kevent)) break; ret = copy_event_to_user(group, kevent, buf); fsnotify_destroy_event(group, kevent); if (ret < 0) break; buf += ret; count -= ret; continue; } ret = -EAGAIN; if (file->f_flags & O_NONBLOCK) break; ret = -ERESTARTSYS; if (signal_pending(current)) break; if (start != buf) break; wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } remove_wait_queue(&group->notification_waitq, &wait); if (start != buf && ret != -EFAULT) ret = buf - start; return ret; } static int inotify_release(struct inode *ignored, struct file *file) { struct fsnotify_group *group = file->private_data; pr_debug("%s: group=%p\n", __func__, group); /* free this group, matching get was inotify_init->fsnotify_obtain_group */ fsnotify_destroy_group(group); return 0; } static long inotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct fsnotify_group *group; struct fsnotify_event *fsn_event; void __user *p; int ret = -ENOTTY; size_t send_len = 0; group = file->private_data; p = (void __user *) arg; pr_debug("%s: group=%p cmd=%u\n", __func__, group, cmd); switch (cmd) { case FIONREAD: spin_lock(&group->notification_lock); list_for_each_entry(fsn_event, &group->notification_list, list) { send_len += sizeof(struct inotify_event); send_len += round_event_name_len(fsn_event); } spin_unlock(&group->notification_lock); ret = put_user(send_len, (int __user *) p); break; #ifdef CONFIG_CHECKPOINT_RESTORE case INOTIFY_IOC_SETNEXTWD: ret = -EINVAL; if (arg >= 1 && arg <= INT_MAX) { struct inotify_group_private_data *data; data = &group->inotify_data; spin_lock(&data->idr_lock); idr_set_cursor(&data->idr, (unsigned int)arg); spin_unlock(&data->idr_lock); ret = 0; } break; #endif /* CONFIG_CHECKPOINT_RESTORE */ } return ret; } static const struct file_operations inotify_fops = { .show_fdinfo = inotify_show_fdinfo, .poll = inotify_poll, .read = inotify_read, .fasync = fsnotify_fasync, .release = inotify_release, .unlocked_ioctl = inotify_ioctl, .compat_ioctl = inotify_ioctl, .llseek = noop_llseek, }; /* * find_inode - resolve a user-given path to a specific inode */ static int inotify_find_inode(const char __user *dirname, struct path *path, unsigned int flags, __u64 mask) { int error; error = user_path_at(AT_FDCWD, dirname, flags, path); if (error) return error; /* you can only watch an inode if you have read permissions on it */ error = path_permission(path, MAY_READ); if (error) { path_put(path); return error; } error = security_path_notify(path, mask, FSNOTIFY_OBJ_TYPE_INODE); if (error) path_put(path); return error; } static int inotify_add_to_idr(struct idr *idr, spinlock_t *idr_lock, struct inotify_inode_mark *i_mark) { int ret; idr_preload(GFP_KERNEL); spin_lock(idr_lock); ret = idr_alloc_cyclic(idr, i_mark, 1, 0, GFP_NOWAIT); if (ret >= 0) { /* we added the mark to the idr, take a reference */ i_mark->wd = ret; fsnotify_get_mark(&i_mark->fsn_mark); } spin_unlock(idr_lock); idr_preload_end(); return ret < 0 ? ret : 0; } static struct inotify_inode_mark *inotify_idr_find_locked(struct fsnotify_group *group, int wd) { struct idr *idr = &group->inotify_data.idr; spinlock_t *idr_lock = &group->inotify_data.idr_lock; struct inotify_inode_mark *i_mark; assert_spin_locked(idr_lock); i_mark = idr_find(idr, wd); if (i_mark) { struct fsnotify_mark *fsn_mark = &i_mark->fsn_mark; fsnotify_get_mark(fsn_mark); /* One ref for being in the idr, one ref we just took */ BUG_ON(refcount_read(&fsn_mark->refcnt) < 2); } return i_mark; } static struct inotify_inode_mark *inotify_idr_find(struct fsnotify_group *group, int wd) { struct inotify_inode_mark *i_mark; spinlock_t *idr_lock = &group->inotify_data.idr_lock; spin_lock(idr_lock); i_mark = inotify_idr_find_locked(group, wd); spin_unlock(idr_lock); return i_mark; } /* * Remove the mark from the idr (if present) and drop the reference * on the mark because it was in the idr. */ static void inotify_remove_from_idr(struct fsnotify_group *group, struct inotify_inode_mark *i_mark) { struct idr *idr = &group->inotify_data.idr; spinlock_t *idr_lock = &group->inotify_data.idr_lock; struct inotify_inode_mark *found_i_mark = NULL; int wd; spin_lock(idr_lock); wd = i_mark->wd; /* * does this i_mark think it is in the idr? we shouldn't get called * if it wasn't.... */ if (wd == -1) { WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p\n", __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group); goto out; } /* Lets look in the idr to see if we find it */ found_i_mark = inotify_idr_find_locked(group, wd); if (unlikely(!found_i_mark)) { WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p\n", __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group); goto out; } /* * We found an mark in the idr at the right wd, but it's * not the mark we were told to remove. eparis seriously * fucked up somewhere. */ if (unlikely(found_i_mark != i_mark)) { WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p " "found_i_mark=%p found_i_mark->wd=%d " "found_i_mark->group=%p\n", __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group, found_i_mark, found_i_mark->wd, found_i_mark->fsn_mark.group); goto out; } /* * One ref for being in the idr * one ref grabbed by inotify_idr_find */ if (unlikely(refcount_read(&i_mark->fsn_mark.refcnt) < 2)) { printk(KERN_ERR "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p\n", __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group); /* we can't really recover with bad ref cnting.. */ BUG(); } idr_remove(idr, wd); /* Removed from the idr, drop that ref. */ fsnotify_put_mark(&i_mark->fsn_mark); out: i_mark->wd = -1; spin_unlock(idr_lock); /* match the ref taken by inotify_idr_find_locked() */ if (found_i_mark) fsnotify_put_mark(&found_i_mark->fsn_mark); } /* * Send IN_IGNORED for this wd, remove this wd from the idr. */ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group) { struct inotify_inode_mark *i_mark; /* Queue ignore event for the watch */ inotify_handle_inode_event(fsn_mark, FS_IN_IGNORED, NULL, NULL, NULL, 0); i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); /* remove this mark from the idr */ inotify_remove_from_idr(group, i_mark); dec_inotify_watches(group->inotify_data.ucounts); } static int inotify_update_existing_watch(struct fsnotify_group *group, struct inode *inode, u32 arg) { struct fsnotify_mark *fsn_mark; struct inotify_inode_mark *i_mark; __u32 old_mask, new_mask; int replace = !(arg & IN_MASK_ADD); int create = (arg & IN_MASK_CREATE); int ret; fsn_mark = fsnotify_find_inode_mark(inode, group); if (!fsn_mark) return -ENOENT; else if (create) { ret = -EEXIST; goto out; } i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); spin_lock(&fsn_mark->lock); old_mask = fsn_mark->mask; if (replace) { fsn_mark->mask = 0; fsn_mark->flags &= ~INOTIFY_MARK_FLAGS; } fsn_mark->mask |= inotify_arg_to_mask(inode, arg); fsn_mark->flags |= inotify_arg_to_flags(arg); new_mask = fsn_mark->mask; spin_unlock(&fsn_mark->lock); if (old_mask != new_mask) { /* more bits in old than in new? */ int dropped = (old_mask & ~new_mask); /* more bits in this fsn_mark than the inode's mask? */ int do_inode = (new_mask & ~inode->i_fsnotify_mask); /* update the inode with this new fsn_mark */ if (dropped || do_inode) fsnotify_recalc_mask(inode->i_fsnotify_marks); } /* return the wd */ ret = i_mark->wd; out: /* match the get from fsnotify_find_mark() */ fsnotify_put_mark(fsn_mark); return ret; } static int inotify_new_watch(struct fsnotify_group *group, struct inode *inode, u32 arg) { struct inotify_inode_mark *tmp_i_mark; int ret; struct idr *idr = &group->inotify_data.idr; spinlock_t *idr_lock = &group->inotify_data.idr_lock; tmp_i_mark = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); if (unlikely(!tmp_i_mark)) return -ENOMEM; fsnotify_init_mark(&tmp_i_mark->fsn_mark, group); tmp_i_mark->fsn_mark.mask = inotify_arg_to_mask(inode, arg); tmp_i_mark->fsn_mark.flags = inotify_arg_to_flags(arg); tmp_i_mark->wd = -1; ret = inotify_add_to_idr(idr, idr_lock, tmp_i_mark); if (ret) goto out_err; /* increment the number of watches the user has */ if (!inc_inotify_watches(group->inotify_data.ucounts)) { inotify_remove_from_idr(group, tmp_i_mark); ret = -ENOSPC; goto out_err; } /* we are on the idr, now get on the inode */ ret = fsnotify_add_inode_mark_locked(&tmp_i_mark->fsn_mark, inode, 0); if (ret) { /* we failed to get on the inode, get off the idr */ inotify_remove_from_idr(group, tmp_i_mark); goto out_err; } /* return the watch descriptor for this new mark */ ret = tmp_i_mark->wd; out_err: /* match the ref from fsnotify_init_mark() */ fsnotify_put_mark(&tmp_i_mark->fsn_mark); return ret; } static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg) { int ret = 0; fsnotify_group_lock(group); /* try to update and existing watch with the new arg */ ret = inotify_update_existing_watch(group, inode, arg); /* no mark present, try to add a new one */ if (ret == -ENOENT) ret = inotify_new_watch(group, inode, arg); fsnotify_group_unlock(group); return ret; } static struct fsnotify_group *inotify_new_group(unsigned int max_events) { struct fsnotify_group *group; struct inotify_event_info *oevent; group = fsnotify_alloc_group(&inotify_fsnotify_ops, FSNOTIFY_GROUP_USER); if (IS_ERR(group)) return group; oevent = kmalloc(sizeof(struct inotify_event_info), GFP_KERNEL_ACCOUNT); if (unlikely(!oevent)) { fsnotify_destroy_group(group); return ERR_PTR(-ENOMEM); } group->overflow_event = &oevent->fse; fsnotify_init_event(group->overflow_event); oevent->mask = FS_Q_OVERFLOW; oevent->wd = -1; oevent->sync_cookie = 0; oevent->name_len = 0; group->max_events = max_events; group->memcg = get_mem_cgroup_from_mm(current->mm); spin_lock_init(&group->inotify_data.idr_lock); idr_init(&group->inotify_data.idr); group->inotify_data.ucounts = inc_ucount(current_user_ns(), current_euid(), UCOUNT_INOTIFY_INSTANCES); if (!group->inotify_data.ucounts) { fsnotify_destroy_group(group); return ERR_PTR(-EMFILE); } return group; } /* inotify syscalls */ static int do_inotify_init(int flags) { struct fsnotify_group *group; int ret; /* Check the IN_* constants for consistency. */ BUILD_BUG_ON(IN_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON(IN_NONBLOCK != O_NONBLOCK); if (flags & ~(IN_CLOEXEC | IN_NONBLOCK)) return -EINVAL; /* fsnotify_obtain_group took a reference to group, we put this when we kill the file in the end */ group = inotify_new_group(inotify_max_queued_events); if (IS_ERR(group)) return PTR_ERR(group); ret = anon_inode_getfd("inotify", &inotify_fops, group, O_RDONLY | flags); if (ret < 0) fsnotify_destroy_group(group); return ret; } SYSCALL_DEFINE1(inotify_init1, int, flags) { return do_inotify_init(flags); } SYSCALL_DEFINE0(inotify_init) { return do_inotify_init(0); } SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, u32, mask) { struct fsnotify_group *group; struct inode *inode; struct path path; struct fd f; int ret; unsigned flags = 0; /* * We share a lot of code with fs/dnotify. We also share * the bit layout between inotify's IN_* and the fsnotify * FS_*. This check ensures that only the inotify IN_* * bits get passed in and set in watches/events. */ if (unlikely(mask & ~ALL_INOTIFY_BITS)) return -EINVAL; /* * Require at least one valid bit set in the mask. * Without _something_ set, we would have no events to * watch for. */ if (unlikely(!(mask & ALL_INOTIFY_BITS))) return -EINVAL; f = fdget(fd); if (unlikely(!f.file)) return -EBADF; /* IN_MASK_ADD and IN_MASK_CREATE don't make sense together */ if (unlikely((mask & IN_MASK_ADD) && (mask & IN_MASK_CREATE))) { ret = -EINVAL; goto fput_and_out; } /* verify that this is indeed an inotify instance */ if (unlikely(f.file->f_op != &inotify_fops)) { ret = -EINVAL; goto fput_and_out; } if (!(mask & IN_DONT_FOLLOW)) flags |= LOOKUP_FOLLOW; if (mask & IN_ONLYDIR) flags |= LOOKUP_DIRECTORY; ret = inotify_find_inode(pathname, &path, flags, (mask & IN_ALL_EVENTS)); if (ret) goto fput_and_out; /* inode held in place by reference to path; group by fget on fd */ inode = path.dentry->d_inode; group = f.file->private_data; /* create/update an inode mark */ ret = inotify_update_watch(group, inode, mask); path_put(&path); fput_and_out: fdput(f); return ret; } SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd) { struct fsnotify_group *group; struct inotify_inode_mark *i_mark; struct fd f; int ret = -EINVAL; f = fdget(fd); if (unlikely(!f.file)) return -EBADF; /* verify that this is indeed an inotify instance */ if (unlikely(f.file->f_op != &inotify_fops)) goto out; group = f.file->private_data; i_mark = inotify_idr_find(group, wd); if (unlikely(!i_mark)) goto out; ret = 0; fsnotify_destroy_mark(&i_mark->fsn_mark, group); /* match ref taken by inotify_idr_find */ fsnotify_put_mark(&i_mark->fsn_mark); out: fdput(f); return ret; } /* * inotify_user_setup - Our initialization function. Note that we cannot return * error because we have compiled-in VFS hooks. So an (unlikely) failure here * must result in panic(). */ static int __init inotify_user_setup(void) { unsigned long watches_max; struct sysinfo si; si_meminfo(&si); /* * Allow up to 1% of addressable memory to be allocated for inotify * watches (per user) limited to the range [8192, 1048576]. */ watches_max = (((si.totalram - si.totalhigh) / 100) << PAGE_SHIFT) / INOTIFY_WATCH_COST; watches_max = clamp(watches_max, 8192UL, 1048576UL); BUILD_BUG_ON(IN_ACCESS != FS_ACCESS); BUILD_BUG_ON(IN_MODIFY != FS_MODIFY); BUILD_BUG_ON(IN_ATTRIB != FS_ATTRIB); BUILD_BUG_ON(IN_CLOSE_WRITE != FS_CLOSE_WRITE); BUILD_BUG_ON(IN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE); BUILD_BUG_ON(IN_OPEN != FS_OPEN); BUILD_BUG_ON(IN_MOVED_FROM != FS_MOVED_FROM); BUILD_BUG_ON(IN_MOVED_TO != FS_MOVED_TO); BUILD_BUG_ON(IN_CREATE != FS_CREATE); BUILD_BUG_ON(IN_DELETE != FS_DELETE); BUILD_BUG_ON(IN_DELETE_SELF != FS_DELETE_SELF); BUILD_BUG_ON(IN_MOVE_SELF != FS_MOVE_SELF); BUILD_BUG_ON(IN_UNMOUNT != FS_UNMOUNT); BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW); BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED); BUILD_BUG_ON(IN_ISDIR != FS_ISDIR); BUILD_BUG_ON(HWEIGHT32(ALL_INOTIFY_BITS) != 22); inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC|SLAB_ACCOUNT); inotify_max_queued_events = 16384; init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = 128; init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES] = watches_max; inotify_sysctls_init(); return 0; } fs_initcall(inotify_user_setup);
1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 18 18 18 3 1 1 3 5 4 3 4 3 3 3 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/namei.h> #include <linux/nospec.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> #include "io_uring.h" #include "tctx.h" #include "poll.h" #include "timeout.h" #include "waitid.h" #include "futex.h" #include "cancel.h" struct io_cancel { struct file *file; u64 addr; u32 flags; s32 fd; u8 opcode; }; #define CANCEL_FLAGS (IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \ IORING_ASYNC_CANCEL_ANY | IORING_ASYNC_CANCEL_FD_FIXED | \ IORING_ASYNC_CANCEL_USERDATA | IORING_ASYNC_CANCEL_OP) /* * Returns true if the request matches the criteria outlined by 'cd'. */ bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd) { bool match_user_data = cd->flags & IORING_ASYNC_CANCEL_USERDATA; if (req->ctx != cd->ctx) return false; if (!(cd->flags & (IORING_ASYNC_CANCEL_FD | IORING_ASYNC_CANCEL_OP))) match_user_data = true; if (cd->flags & IORING_ASYNC_CANCEL_ANY) goto check_seq; if (cd->flags & IORING_ASYNC_CANCEL_FD) { if (req->file != cd->file) return false; } if (cd->flags & IORING_ASYNC_CANCEL_OP) { if (req->opcode != cd->opcode) return false; } if (match_user_data && req->cqe.user_data != cd->data) return false; if (cd->flags & IORING_ASYNC_CANCEL_ALL) { check_seq: if (io_cancel_match_sequence(req, cd->seq)) return false; } return true; } static bool io_cancel_cb(struct io_wq_work *work, void *data) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); struct io_cancel_data *cd = data; return io_cancel_req_match(req, cd); } static int io_async_cancel_one(struct io_uring_task *tctx, struct io_cancel_data *cd) { enum io_wq_cancel cancel_ret; int ret = 0; bool all; if (!tctx || !tctx->io_wq) return -ENOENT; all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY); cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, cd, all); switch (cancel_ret) { case IO_WQ_CANCEL_OK: ret = 0; break; case IO_WQ_CANCEL_RUNNING: ret = -EALREADY; break; case IO_WQ_CANCEL_NOTFOUND: ret = -ENOENT; break; } return ret; } int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd, unsigned issue_flags) { struct io_ring_ctx *ctx = cd->ctx; int ret; WARN_ON_ONCE(!io_wq_current_is_worker() && tctx != current->io_uring); ret = io_async_cancel_one(tctx, cd); /* * Fall-through even for -EALREADY, as we may have poll armed * that need unarming. */ if (!ret) return 0; ret = io_poll_cancel(ctx, cd, issue_flags); if (ret != -ENOENT) return ret; ret = io_waitid_cancel(ctx, cd, issue_flags); if (ret != -ENOENT) return ret; ret = io_futex_cancel(ctx, cd, issue_flags); if (ret != -ENOENT) return ret; spin_lock(&ctx->completion_lock); if (!(cd->flags & IORING_ASYNC_CANCEL_FD)) ret = io_timeout_cancel(ctx, cd); spin_unlock(&ctx->completion_lock); return ret; } int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_cancel *cancel = io_kiocb_to_cmd(req, struct io_cancel); if (unlikely(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; if (sqe->off || sqe->splice_fd_in) return -EINVAL; cancel->addr = READ_ONCE(sqe->addr); cancel->flags = READ_ONCE(sqe->cancel_flags); if (cancel->flags & ~CANCEL_FLAGS) return -EINVAL; if (cancel->flags & IORING_ASYNC_CANCEL_FD) { if (cancel->flags & IORING_ASYNC_CANCEL_ANY) return -EINVAL; cancel->fd = READ_ONCE(sqe->fd); } if (cancel->flags & IORING_ASYNC_CANCEL_OP) { if (cancel->flags & IORING_ASYNC_CANCEL_ANY) return -EINVAL; cancel->opcode = READ_ONCE(sqe->len); } return 0; } static int __io_async_cancel(struct io_cancel_data *cd, struct io_uring_task *tctx, unsigned int issue_flags) { bool all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY); struct io_ring_ctx *ctx = cd->ctx; struct io_tctx_node *node; int ret, nr = 0; do { ret = io_try_cancel(tctx, cd, issue_flags); if (ret == -ENOENT) break; if (!all) return ret; nr++; } while (1); /* slow path, try all io-wq's */ io_ring_submit_lock(ctx, issue_flags); ret = -ENOENT; list_for_each_entry(node, &ctx->tctx_list, ctx_node) { ret = io_async_cancel_one(node->task->io_uring, cd); if (ret != -ENOENT) { if (!all) break; nr++; } } io_ring_submit_unlock(ctx, issue_flags); return all ? nr : ret; } int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) { struct io_cancel *cancel = io_kiocb_to_cmd(req, struct io_cancel); struct io_cancel_data cd = { .ctx = req->ctx, .data = cancel->addr, .flags = cancel->flags, .opcode = cancel->opcode, .seq = atomic_inc_return(&req->ctx->cancel_seq), }; struct io_uring_task *tctx = req->task->io_uring; int ret; if (cd.flags & IORING_ASYNC_CANCEL_FD) { if (req->flags & REQ_F_FIXED_FILE || cd.flags & IORING_ASYNC_CANCEL_FD_FIXED) { req->flags |= REQ_F_FIXED_FILE; req->file = io_file_get_fixed(req, cancel->fd, issue_flags); } else { req->file = io_file_get_normal(req, cancel->fd); } if (!req->file) { ret = -EBADF; goto done; } cd.file = req->file; } ret = __io_async_cancel(&cd, tctx, issue_flags); done: if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); return IOU_OK; } void init_hash_table(struct io_hash_table *table, unsigned size) { unsigned int i; for (i = 0; i < size; i++) { spin_lock_init(&table->hbs[i].lock); INIT_HLIST_HEAD(&table->hbs[i].list); } } static int __io_sync_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd, int fd) { struct io_ring_ctx *ctx = cd->ctx; /* fixed must be grabbed every time since we drop the uring_lock */ if ((cd->flags & IORING_ASYNC_CANCEL_FD) && (cd->flags & IORING_ASYNC_CANCEL_FD_FIXED)) { if (unlikely(fd >= ctx->nr_user_files)) return -EBADF; fd = array_index_nospec(fd, ctx->nr_user_files); cd->file = io_file_from_index(&ctx->file_table, fd); if (!cd->file) return -EBADF; } return __io_async_cancel(cd, tctx, 0); } int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg) __must_hold(&ctx->uring_lock) { struct io_cancel_data cd = { .ctx = ctx, .seq = atomic_inc_return(&ctx->cancel_seq), }; ktime_t timeout = KTIME_MAX; struct io_uring_sync_cancel_reg sc; struct file *file = NULL; DEFINE_WAIT(wait); int ret, i; if (copy_from_user(&sc, arg, sizeof(sc))) return -EFAULT; if (sc.flags & ~CANCEL_FLAGS) return -EINVAL; for (i = 0; i < ARRAY_SIZE(sc.pad); i++) if (sc.pad[i]) return -EINVAL; for (i = 0; i < ARRAY_SIZE(sc.pad2); i++) if (sc.pad2[i]) return -EINVAL; cd.data = sc.addr; cd.flags = sc.flags; cd.opcode = sc.opcode; /* we can grab a normal file descriptor upfront */ if ((cd.flags & IORING_ASYNC_CANCEL_FD) && !(cd.flags & IORING_ASYNC_CANCEL_FD_FIXED)) { file = fget(sc.fd); if (!file) return -EBADF; cd.file = file; } ret = __io_sync_cancel(current->io_uring, &cd, sc.fd); /* found something, done! */ if (ret != -EALREADY) goto out; if (sc.timeout.tv_sec != -1UL || sc.timeout.tv_nsec != -1UL) { struct timespec64 ts = { .tv_sec = sc.timeout.tv_sec, .tv_nsec = sc.timeout.tv_nsec }; timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns()); } /* * Keep looking until we get -ENOENT. we'll get woken everytime * every time a request completes and will retry the cancelation. */ do { cd.seq = atomic_inc_return(&ctx->cancel_seq); prepare_to_wait(&ctx->cq_wait, &wait, TASK_INTERRUPTIBLE); ret = __io_sync_cancel(current->io_uring, &cd, sc.fd); mutex_unlock(&ctx->uring_lock); if (ret != -EALREADY) break; ret = io_run_task_work_sig(ctx); if (ret < 0) break; ret = schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS); if (!ret) { ret = -ETIME; break; } mutex_lock(&ctx->uring_lock); } while (1); finish_wait(&ctx->cq_wait, &wait); mutex_lock(&ctx->uring_lock); if (ret == -ENOENT || ret > 0) ret = 0; out: if (file) fput(file); return ret; }
1 1 1 2 2 2 1 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 // SPDX-License-Identifier: GPL-2.0-or-later /* * lib/ts_kmp.c Knuth-Morris-Pratt text search implementation * * Authors: Thomas Graf <tgraf@suug.ch> * * ========================================================================== * * Implements a linear-time string-matching algorithm due to Knuth, * Morris, and Pratt [1]. Their algorithm avoids the explicit * computation of the transition function DELTA altogether. Its * matching time is O(n), for n being length(text), using just an * auxiliary function PI[1..m], for m being length(pattern), * precomputed from the pattern in time O(m). The array PI allows * the transition function DELTA to be computed efficiently * "on the fly" as needed. Roughly speaking, for any state * "q" = 0,1,...,m and any character "a" in SIGMA, the value * PI["q"] contains the information that is independent of "a" and * is needed to compute DELTA("q", "a") [2]. Since the array PI * has only m entries, whereas DELTA has O(m|SIGMA|) entries, we * save a factor of |SIGMA| in the preprocessing time by computing * PI rather than DELTA. * * [1] Cormen, Leiserson, Rivest, Stein * Introdcution to Algorithms, 2nd Edition, MIT Press * [2] See finite automaton theory */ #include <linux/module.h> #include <linux/types.h> #include <linux/string.h> #include <linux/ctype.h> #include <linux/textsearch.h> struct ts_kmp { u8 * pattern; unsigned int pattern_len; unsigned int prefix_tbl[]; }; static unsigned int kmp_find(struct ts_config *conf, struct ts_state *state) { struct ts_kmp *kmp = ts_config_priv(conf); unsigned int i, q = 0, text_len, consumed = state->offset; const u8 *text; const int icase = conf->flags & TS_IGNORECASE; for (;;) { text_len = conf->get_next_block(consumed, &text, conf, state); if (unlikely(text_len == 0)) break; for (i = 0; i < text_len; i++) { while (q > 0 && kmp->pattern[q] != (icase ? toupper(text[i]) : text[i])) q = kmp->prefix_tbl[q - 1]; if (kmp->pattern[q] == (icase ? toupper(text[i]) : text[i])) q++; if (unlikely(q == kmp->pattern_len)) { state->offset = consumed + i + 1; return state->offset - kmp->pattern_len; } } consumed += text_len; } return UINT_MAX; } static inline void compute_prefix_tbl(const u8 *pattern, unsigned int len, unsigned int *prefix_tbl, int flags) { unsigned int k, q; const u8 icase = flags & TS_IGNORECASE; for (k = 0, q = 1; q < len; q++) { while (k > 0 && (icase ? toupper(pattern[k]) : pattern[k]) != (icase ? toupper(pattern[q]) : pattern[q])) k = prefix_tbl[k-1]; if ((icase ? toupper(pattern[k]) : pattern[k]) == (icase ? toupper(pattern[q]) : pattern[q])) k++; prefix_tbl[q] = k; } } static struct ts_config *kmp_init(const void *pattern, unsigned int len, gfp_t gfp_mask, int flags) { struct ts_config *conf; struct ts_kmp *kmp; int i; unsigned int prefix_tbl_len = len * sizeof(unsigned int); size_t priv_size = sizeof(*kmp) + len + prefix_tbl_len; conf = alloc_ts_config(priv_size, gfp_mask); if (IS_ERR(conf)) return conf; conf->flags = flags; kmp = ts_config_priv(conf); kmp->pattern_len = len; compute_prefix_tbl(pattern, len, kmp->prefix_tbl, flags); kmp->pattern = (u8 *) kmp->prefix_tbl + prefix_tbl_len; if (flags & TS_IGNORECASE) for (i = 0; i < len; i++) kmp->pattern[i] = toupper(((u8 *)pattern)[i]); else memcpy(kmp->pattern, pattern, len); return conf; } static void *kmp_get_pattern(struct ts_config *conf) { struct ts_kmp *kmp = ts_config_priv(conf); return kmp->pattern; } static unsigned int kmp_get_pattern_len(struct ts_config *conf) { struct ts_kmp *kmp = ts_config_priv(conf); return kmp->pattern_len; } static struct ts_ops kmp_ops = { .name = "kmp", .find = kmp_find, .init = kmp_init, .get_pattern = kmp_get_pattern, .get_pattern_len = kmp_get_pattern_len, .owner = THIS_MODULE, .list = LIST_HEAD_INIT(kmp_ops.list) }; static int __init init_kmp(void) { return textsearch_register(&kmp_ops); } static void __exit exit_kmp(void) { textsearch_unregister(&kmp_ops); } MODULE_LICENSE("GPL"); module_init(init_kmp); module_exit(exit_kmp);
11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 // SPDX-License-Identifier: GPL-2.0-or-later /* * NET Generic infrastructure for Network protocols. * * Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * From code originally in include/net/tcp.h */ #include <linux/module.h> #include <linux/random.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/tcp.h> #include <linux/vmalloc.h> #include <net/request_sock.h> /* * Maximum number of SYN_RECV sockets in queue per LISTEN socket. * One SYN_RECV socket costs about 80bytes on a 32bit machine. * It would be better to replace it with a global counter for all sockets * but then some measure against one socket starving all other sockets * would be needed. * * The minimum value of it is 128. Experiments with real servers show that * it is absolutely not enough even at 100conn/sec. 256 cures most * of problems. * This value is adjusted to 128 for low memory machines, * and it will increase in proportion to the memory of machine. * Note : Dont forget somaxconn that may limit backlog too. */ void reqsk_queue_alloc(struct request_sock_queue *queue) { queue->fastopenq.rskq_rst_head = NULL; queue->fastopenq.rskq_rst_tail = NULL; queue->fastopenq.qlen = 0; queue->rskq_accept_head = NULL; } /* * This function is called to set a Fast Open socket's "fastopen_rsk" field * to NULL when a TFO socket no longer needs to access the request_sock. * This happens only after 3WHS has been either completed or aborted (e.g., * RST is received). * * Before TFO, a child socket is created only after 3WHS is completed, * hence it never needs to access the request_sock. things get a lot more * complex with TFO. A child socket, accepted or not, has to access its * request_sock for 3WHS processing, e.g., to retransmit SYN-ACK pkts, * until 3WHS is either completed or aborted. Afterwards the req will stay * until either the child socket is accepted, or in the rare case when the * listener is closed before the child is accepted. * * In short, a request socket is only freed after BOTH 3WHS has completed * (or aborted) and the child socket has been accepted (or listener closed). * When a child socket is accepted, its corresponding req->sk is set to * NULL since it's no longer needed. More importantly, "req->sk == NULL" * will be used by the code below to determine if a child socket has been * accepted or not, and the check is protected by the fastopenq->lock * described below. * * Note that fastopen_rsk is only accessed from the child socket's context * with its socket lock held. But a request_sock (req) can be accessed by * both its child socket through fastopen_rsk, and a listener socket through * icsk_accept_queue.rskq_accept_head. To protect the access a simple spin * lock per listener "icsk->icsk_accept_queue.fastopenq->lock" is created. * only in the rare case when both the listener and the child locks are held, * e.g., in inet_csk_listen_stop() do we not need to acquire the lock. * The lock also protects other fields such as fastopenq->qlen, which is * decremented by this function when fastopen_rsk is no longer needed. * * Note that another solution was to simply use the existing socket lock * from the listener. But first socket lock is difficult to use. It is not * a simple spin lock - one must consider sock_owned_by_user() and arrange * to use sk_add_backlog() stuff. But what really makes it infeasible is the * locking hierarchy violation. E.g., inet_csk_listen_stop() may try to * acquire a child's lock while holding listener's socket lock. A corner * case might also exist in tcp_v4_hnd_req() that will trigger this locking * order. * * This function also sets "treq->tfo_listener" to false. * treq->tfo_listener is used by the listener so it is protected by the * fastopenq->lock in this function. */ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, bool reset) { struct sock *lsk = req->rsk_listener; struct fastopen_queue *fastopenq; fastopenq = &inet_csk(lsk)->icsk_accept_queue.fastopenq; RCU_INIT_POINTER(tcp_sk(sk)->fastopen_rsk, NULL); spin_lock_bh(&fastopenq->lock); fastopenq->qlen--; tcp_rsk(req)->tfo_listener = false; if (req->sk) /* the child socket hasn't been accepted yet */ goto out; if (!reset || lsk->sk_state != TCP_LISTEN) { /* If the listener has been closed don't bother with the * special RST handling below. */ spin_unlock_bh(&fastopenq->lock); reqsk_put(req); return; } /* Wait for 60secs before removing a req that has triggered RST. * This is a simple defense against TFO spoofing attack - by * counting the req against fastopen.max_qlen, and disabling * TFO when the qlen exceeds max_qlen. * * For more details see CoNext'11 "TCP Fast Open" paper. */ req->rsk_timer.expires = jiffies + 60*HZ; if (fastopenq->rskq_rst_head == NULL) fastopenq->rskq_rst_head = req; else fastopenq->rskq_rst_tail->dl_next = req; req->dl_next = NULL; fastopenq->rskq_rst_tail = req; fastopenq->qlen++; out: spin_unlock_bh(&fastopenq->lock); }
24 26 26 26 24 26 3 1 1 1 1 3 2 3 1 2 2 2 3 2 2 1 1 1 1 1 3 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/file.h> #include <linux/io_uring.h> #include <trace/events/io_uring.h> #include <uapi/linux/io_uring.h> #include "io_uring.h" #include "refs.h" #include "cancel.h" #include "timeout.h" struct io_timeout { struct file *file; u32 off; u32 target_seq; u32 repeats; struct list_head list; /* head of the link, used by linked timeouts only */ struct io_kiocb *head; /* for linked completions */ struct io_kiocb *prev; }; struct io_timeout_rem { struct file *file; u64 addr; /* timeout update */ struct timespec64 ts; u32 flags; bool ltimeout; }; static inline bool io_is_timeout_noseq(struct io_kiocb *req) { struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_timeout_data *data = req->async_data; return !timeout->off || data->flags & IORING_TIMEOUT_MULTISHOT; } static inline void io_put_req(struct io_kiocb *req) { if (req_ref_put_and_test(req)) { io_queue_next(req); io_free_req(req); } } static inline bool io_timeout_finish(struct io_timeout *timeout, struct io_timeout_data *data) { if (!(data->flags & IORING_TIMEOUT_MULTISHOT)) return true; if (!timeout->off || (timeout->repeats && --timeout->repeats)) return false; return true; } static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer); static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts) { struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_timeout_data *data = req->async_data; struct io_ring_ctx *ctx = req->ctx; if (!io_timeout_finish(timeout, data)) { if (io_req_post_cqe(req, -ETIME, IORING_CQE_F_MORE)) { /* re-arm timer */ spin_lock_irq(&ctx->timeout_lock); list_add(&timeout->list, ctx->timeout_list.prev); data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); spin_unlock_irq(&ctx->timeout_lock); return; } } io_req_task_complete(req, ts); } static bool io_kill_timeout(struct io_kiocb *req, int status) __must_hold(&req->ctx->timeout_lock) { struct io_timeout_data *io = req->async_data; if (hrtimer_try_to_cancel(&io->timer) != -1) { struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); if (status) req_set_fail(req); atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); list_del_init(&timeout->list); io_req_queue_tw_complete(req, status); return true; } return false; } __cold void io_flush_timeouts(struct io_ring_ctx *ctx) { u32 seq; struct io_timeout *timeout, *tmp; spin_lock_irq(&ctx->timeout_lock); seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { struct io_kiocb *req = cmd_to_io_kiocb(timeout); u32 events_needed, events_got; if (io_is_timeout_noseq(req)) break; /* * Since seq can easily wrap around over time, subtract * the last seq at which timeouts were flushed before comparing. * Assuming not more than 2^31-1 events have happened since, * these subtractions won't have wrapped, so we can check if * target is in [last_seq, current_seq] by comparing the two. */ events_needed = timeout->target_seq - ctx->cq_last_tm_flush; events_got = seq - ctx->cq_last_tm_flush; if (events_got < events_needed) break; io_kill_timeout(req, 0); } ctx->cq_last_tm_flush = seq; spin_unlock_irq(&ctx->timeout_lock); } static void io_req_tw_fail_links(struct io_kiocb *link, struct io_tw_state *ts) { io_tw_lock(link->ctx, ts); while (link) { struct io_kiocb *nxt = link->link; long res = -ECANCELED; if (link->flags & REQ_F_FAIL) res = link->cqe.res; link->link = NULL; io_req_set_res(link, res, 0); io_req_task_complete(link, ts); link = nxt; } } static void io_fail_links(struct io_kiocb *req) __must_hold(&req->ctx->completion_lock) { struct io_kiocb *link = req->link; bool ignore_cqes = req->flags & REQ_F_SKIP_LINK_CQES; if (!link) return; while (link) { if (ignore_cqes) link->flags |= REQ_F_CQE_SKIP; else link->flags &= ~REQ_F_CQE_SKIP; trace_io_uring_fail_link(req, link); link = link->link; } link = req->link; link->io_task_work.func = io_req_tw_fail_links; io_req_task_work_add(link); req->link = NULL; } static inline void io_remove_next_linked(struct io_kiocb *req) { struct io_kiocb *nxt = req->link; req->link = nxt->link; nxt->link = NULL; } void io_disarm_next(struct io_kiocb *req) __must_hold(&req->ctx->completion_lock) { struct io_kiocb *link = NULL; if (req->flags & REQ_F_ARM_LTIMEOUT) { link = req->link; req->flags &= ~REQ_F_ARM_LTIMEOUT; if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { io_remove_next_linked(req); io_req_queue_tw_complete(link, -ECANCELED); } } else if (req->flags & REQ_F_LINK_TIMEOUT) { struct io_ring_ctx *ctx = req->ctx; spin_lock_irq(&ctx->timeout_lock); link = io_disarm_linked_timeout(req); spin_unlock_irq(&ctx->timeout_lock); if (link) io_req_queue_tw_complete(link, -ECANCELED); } if (unlikely((req->flags & REQ_F_FAIL) && !(req->flags & REQ_F_HARDLINK))) io_fail_links(req); } struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req, struct io_kiocb *link) __must_hold(&req->ctx->completion_lock) __must_hold(&req->ctx->timeout_lock) { struct io_timeout_data *io = link->async_data; struct io_timeout *timeout = io_kiocb_to_cmd(link, struct io_timeout); io_remove_next_linked(req); timeout->head = NULL; if (hrtimer_try_to_cancel(&io->timer) != -1) { list_del(&timeout->list); return link; } return NULL; } static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) { struct io_timeout_data *data = container_of(timer, struct io_timeout_data, timer); struct io_kiocb *req = data->req; struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_ring_ctx *ctx = req->ctx; unsigned long flags; spin_lock_irqsave(&ctx->timeout_lock, flags); list_del_init(&timeout->list); atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); spin_unlock_irqrestore(&ctx->timeout_lock, flags); if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS)) req_set_fail(req); io_req_set_res(req, -ETIME, 0); req->io_task_work.func = io_timeout_complete; io_req_task_work_add(req); return HRTIMER_NORESTART; } static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, struct io_cancel_data *cd) __must_hold(&ctx->timeout_lock) { struct io_timeout *timeout; struct io_timeout_data *io; struct io_kiocb *req = NULL; list_for_each_entry(timeout, &ctx->timeout_list, list) { struct io_kiocb *tmp = cmd_to_io_kiocb(timeout); if (io_cancel_req_match(tmp, cd)) { req = tmp; break; } } if (!req) return ERR_PTR(-ENOENT); io = req->async_data; if (hrtimer_try_to_cancel(&io->timer) == -1) return ERR_PTR(-EALREADY); timeout = io_kiocb_to_cmd(req, struct io_timeout); list_del_init(&timeout->list); return req; } int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) __must_hold(&ctx->completion_lock) { struct io_kiocb *req; spin_lock_irq(&ctx->timeout_lock); req = io_timeout_extract(ctx, cd); spin_unlock_irq(&ctx->timeout_lock); if (IS_ERR(req)) return PTR_ERR(req); io_req_task_queue_fail(req, -ECANCELED); return 0; } static void io_req_task_link_timeout(struct io_kiocb *req, struct io_tw_state *ts) { struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_kiocb *prev = timeout->prev; int ret = -ENOENT; if (prev) { if (!(req->task->flags & PF_EXITING)) { struct io_cancel_data cd = { .ctx = req->ctx, .data = prev->cqe.user_data, }; ret = io_try_cancel(req->task->io_uring, &cd, 0); } io_req_set_res(req, ret ?: -ETIME, 0); io_req_task_complete(req, ts); io_put_req(prev); } else { io_req_set_res(req, -ETIME, 0); io_req_task_complete(req, ts); } } static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) { struct io_timeout_data *data = container_of(timer, struct io_timeout_data, timer); struct io_kiocb *prev, *req = data->req; struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_ring_ctx *ctx = req->ctx; unsigned long flags; spin_lock_irqsave(&ctx->timeout_lock, flags); prev = timeout->head; timeout->head = NULL; /* * We don't expect the list to be empty, that will only happen if we * race with the completion of the linked work. */ if (prev) { io_remove_next_linked(prev); if (!req_ref_inc_not_zero(prev)) prev = NULL; } list_del(&timeout->list); timeout->prev = prev; spin_unlock_irqrestore(&ctx->timeout_lock, flags); req->io_task_work.func = io_req_task_link_timeout; io_req_task_work_add(req); return HRTIMER_NORESTART; } static clockid_t io_timeout_get_clock(struct io_timeout_data *data) { switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) { case IORING_TIMEOUT_BOOTTIME: return CLOCK_BOOTTIME; case IORING_TIMEOUT_REALTIME: return CLOCK_REALTIME; default: /* can't happen, vetted at prep time */ WARN_ON_ONCE(1); fallthrough; case 0: return CLOCK_MONOTONIC; } } static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, struct timespec64 *ts, enum hrtimer_mode mode) __must_hold(&ctx->timeout_lock) { struct io_timeout_data *io; struct io_timeout *timeout; struct io_kiocb *req = NULL; list_for_each_entry(timeout, &ctx->ltimeout_list, list) { struct io_kiocb *tmp = cmd_to_io_kiocb(timeout); if (user_data == tmp->cqe.user_data) { req = tmp; break; } } if (!req) return -ENOENT; io = req->async_data; if (hrtimer_try_to_cancel(&io->timer) == -1) return -EALREADY; hrtimer_init(&io->timer, io_timeout_get_clock(io), mode); io->timer.function = io_link_timeout_fn; hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode); return 0; } static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, struct timespec64 *ts, enum hrtimer_mode mode) __must_hold(&ctx->timeout_lock) { struct io_cancel_data cd = { .ctx = ctx, .data = user_data, }; struct io_kiocb *req = io_timeout_extract(ctx, &cd); struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_timeout_data *data; if (IS_ERR(req)) return PTR_ERR(req); timeout->off = 0; /* noseq */ data = req->async_data; list_add_tail(&timeout->list, &ctx->timeout_list); hrtimer_init(&data->timer, io_timeout_get_clock(data), mode); data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode); return 0; } int io_timeout_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_timeout_rem *tr = io_kiocb_to_cmd(req, struct io_timeout_rem); if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; if (sqe->buf_index || sqe->len || sqe->splice_fd_in) return -EINVAL; tr->ltimeout = false; tr->addr = READ_ONCE(sqe->addr); tr->flags = READ_ONCE(sqe->timeout_flags); if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) { if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1) return -EINVAL; if (tr->flags & IORING_LINK_TIMEOUT_UPDATE) tr->ltimeout = true; if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS)) return -EINVAL; if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2))) return -EFAULT; if (tr->ts.tv_sec < 0 || tr->ts.tv_nsec < 0) return -EINVAL; } else if (tr->flags) { /* timeout removal doesn't support flags */ return -EINVAL; } return 0; } static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags) { return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL; } /* * Remove or update an existing timeout command */ int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) { struct io_timeout_rem *tr = io_kiocb_to_cmd(req, struct io_timeout_rem); struct io_ring_ctx *ctx = req->ctx; int ret; if (!(tr->flags & IORING_TIMEOUT_UPDATE)) { struct io_cancel_data cd = { .ctx = ctx, .data = tr->addr, }; spin_lock(&ctx->completion_lock); ret = io_timeout_cancel(ctx, &cd); spin_unlock(&ctx->completion_lock); } else { enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags); spin_lock_irq(&ctx->timeout_lock); if (tr->ltimeout) ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode); else ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode); spin_unlock_irq(&ctx->timeout_lock); } if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); return IOU_OK; } static int __io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, bool is_timeout_link) { struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_timeout_data *data; unsigned flags; u32 off = READ_ONCE(sqe->off); if (sqe->buf_index || sqe->len != 1 || sqe->splice_fd_in) return -EINVAL; if (off && is_timeout_link) return -EINVAL; flags = READ_ONCE(sqe->timeout_flags); if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK | IORING_TIMEOUT_ETIME_SUCCESS | IORING_TIMEOUT_MULTISHOT)) return -EINVAL; /* more than one clock specified is invalid, obviously */ if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1) return -EINVAL; /* multishot requests only make sense with rel values */ if (!(~flags & (IORING_TIMEOUT_MULTISHOT | IORING_TIMEOUT_ABS))) return -EINVAL; INIT_LIST_HEAD(&timeout->list); timeout->off = off; if (unlikely(off && !req->ctx->off_timeout_used)) req->ctx->off_timeout_used = true; /* * for multishot reqs w/ fixed nr of repeats, repeats tracks the * remaining nr */ timeout->repeats = 0; if ((flags & IORING_TIMEOUT_MULTISHOT) && off > 0) timeout->repeats = off; if (WARN_ON_ONCE(req_has_async_data(req))) return -EFAULT; if (io_alloc_async_data(req)) return -ENOMEM; data = req->async_data; data->req = req; data->flags = flags; if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) return -EFAULT; if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0) return -EINVAL; data->mode = io_translate_timeout_mode(flags); hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode); if (is_timeout_link) { struct io_submit_link *link = &req->ctx->submit_state.link; if (!link->head) return -EINVAL; if (link->last->opcode == IORING_OP_LINK_TIMEOUT) return -EINVAL; timeout->head = link->last; link->last->flags |= REQ_F_ARM_LTIMEOUT; } return 0; } int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { return __io_timeout_prep(req, sqe, false); } int io_link_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { return __io_timeout_prep(req, sqe, true); } int io_timeout(struct io_kiocb *req, unsigned int issue_flags) { struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_ring_ctx *ctx = req->ctx; struct io_timeout_data *data = req->async_data; struct list_head *entry; u32 tail, off = timeout->off; spin_lock_irq(&ctx->timeout_lock); /* * sqe->off holds how many events that need to occur for this * timeout event to be satisfied. If it isn't set, then this is * a pure timeout request, sequence isn't used. */ if (io_is_timeout_noseq(req)) { entry = ctx->timeout_list.prev; goto add; } tail = data_race(ctx->cached_cq_tail) - atomic_read(&ctx->cq_timeouts); timeout->target_seq = tail + off; /* Update the last seq here in case io_flush_timeouts() hasn't. * This is safe because ->completion_lock is held, and submissions * and completions are never mixed in the same ->completion_lock section. */ ctx->cq_last_tm_flush = tail; /* * Insertion sort, ensuring the first entry in the list is always * the one we need first. */ list_for_each_prev(entry, &ctx->timeout_list) { struct io_timeout *nextt = list_entry(entry, struct io_timeout, list); struct io_kiocb *nxt = cmd_to_io_kiocb(nextt); if (io_is_timeout_noseq(nxt)) continue; /* nxt.seq is behind @tail, otherwise would've been completed */ if (off >= nextt->target_seq - tail) break; } add: list_add(&timeout->list, entry); data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); spin_unlock_irq(&ctx->timeout_lock); return IOU_ISSUE_SKIP_COMPLETE; } void io_queue_linked_timeout(struct io_kiocb *req) { struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_ring_ctx *ctx = req->ctx; spin_lock_irq(&ctx->timeout_lock); /* * If the back reference is NULL, then our linked request finished * before we got a chance to setup the timer */ if (timeout->head) { struct io_timeout_data *data = req->async_data; data->timer.function = io_link_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); list_add_tail(&timeout->list, &ctx->ltimeout_list); } spin_unlock_irq(&ctx->timeout_lock); /* drop submission reference */ io_put_req(req); } static bool io_match_task(struct io_kiocb *head, struct task_struct *task, bool cancel_all) __must_hold(&req->ctx->timeout_lock) { struct io_kiocb *req; if (task && head->task != task) return false; if (cancel_all) return true; io_for_each_link(req, head) { if (req->flags & REQ_F_INFLIGHT) return true; } return false; } /* Returns true if we found and killed one or more timeouts */ __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, bool cancel_all) { struct io_timeout *timeout, *tmp; int canceled = 0; /* * completion_lock is needed for io_match_task(). Take it before * timeout_lockfirst to keep locking ordering. */ spin_lock(&ctx->completion_lock); spin_lock_irq(&ctx->timeout_lock); list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { struct io_kiocb *req = cmd_to_io_kiocb(timeout); if (io_match_task(req, tsk, cancel_all) && io_kill_timeout(req, -ECANCELED)) canceled++; } spin_unlock_irq(&ctx->timeout_lock); spin_unlock(&ctx->completion_lock); return canceled != 0; }
1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 // SPDX-License-Identifier: GPL-2.0-only /* * vDPA bus. * * Copyright (c) 2020, Red Hat. All rights reserved. * Author: Jason Wang <jasowang@redhat.com> * */ #include <linux/module.h> #include <linux/idr.h> #include <linux/slab.h> #include <linux/vdpa.h> #include <uapi/linux/vdpa.h> #include <net/genetlink.h> #include <linux/mod_devicetable.h> #include <linux/virtio_ids.h> static LIST_HEAD(mdev_head); /* A global mutex that protects vdpa management device and device level operations. */ static DECLARE_RWSEM(vdpa_dev_lock); static DEFINE_IDA(vdpa_index_ida); void vdpa_set_status(struct vdpa_device *vdev, u8 status) { down_write(&vdev->cf_lock); vdev->config->set_status(vdev, status); up_write(&vdev->cf_lock); } EXPORT_SYMBOL(vdpa_set_status); static struct genl_family vdpa_nl_family; static int vdpa_dev_probe(struct device *d) { struct vdpa_device *vdev = dev_to_vdpa(d); struct vdpa_driver *drv = drv_to_vdpa(vdev->dev.driver); const struct vdpa_config_ops *ops = vdev->config; u32 max_num, min_num = 1; int ret = 0; d->dma_mask = &d->coherent_dma_mask; ret = dma_set_mask_and_coherent(d, DMA_BIT_MASK(64)); if (ret) return ret; max_num = ops->get_vq_num_max(vdev); if (ops->get_vq_num_min) min_num = ops->get_vq_num_min(vdev); if (max_num < min_num) return -EINVAL; if (drv && drv->probe) ret = drv->probe(vdev); return ret; } static void vdpa_dev_remove(struct device *d) { struct vdpa_device *vdev = dev_to_vdpa(d); struct vdpa_driver *drv = drv_to_vdpa(vdev->dev.driver); if (drv && drv->remove) drv->remove(vdev); } static int vdpa_dev_match(struct device *dev, struct device_driver *drv) { struct vdpa_device *vdev = dev_to_vdpa(dev); /* Check override first, and if set, only use the named driver */ if (vdev->driver_override) return strcmp(vdev->driver_override, drv->name) == 0; /* Currently devices must be supported by all vDPA bus drivers */ return 1; } static ssize_t driver_override_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct vdpa_device *vdev = dev_to_vdpa(dev); int ret; ret = driver_set_override(dev, &vdev->driver_override, buf, count); if (ret) return ret; return count; } static ssize_t driver_override_show(struct device *dev, struct device_attribute *attr, char *buf) { struct vdpa_device *vdev = dev_to_vdpa(dev); ssize_t len; device_lock(dev); len = sysfs_emit(buf, "%s\n", vdev->driver_override); device_unlock(dev); return len; } static DEVICE_ATTR_RW(driver_override); static struct attribute *vdpa_dev_attrs[] = { &dev_attr_driver_override.attr, NULL, }; static const struct attribute_group vdpa_dev_group = { .attrs = vdpa_dev_attrs, }; __ATTRIBUTE_GROUPS(vdpa_dev); static const struct bus_type vdpa_bus = { .name = "vdpa", .dev_groups = vdpa_dev_groups, .match = vdpa_dev_match, .probe = vdpa_dev_probe, .remove = vdpa_dev_remove, }; static void vdpa_release_dev(struct device *d) { struct vdpa_device *vdev = dev_to_vdpa(d); const struct vdpa_config_ops *ops = vdev->config; if (ops->free) ops->free(vdev); ida_free(&vdpa_index_ida, vdev->index); kfree(vdev->driver_override); kfree(vdev); } /** * __vdpa_alloc_device - allocate and initilaize a vDPA device * This allows driver to some prepartion after device is * initialized but before registered. * @parent: the parent device * @config: the bus operations that is supported by this device * @ngroups: number of groups supported by this device * @nas: number of address spaces supported by this device * @size: size of the parent structure that contains private data * @name: name of the vdpa device; optional. * @use_va: indicate whether virtual address must be used by this device * * Driver should use vdpa_alloc_device() wrapper macro instead of * using this directly. * * Return: Returns an error when parent/config/dma_dev is not set or fail to get * ida. */ struct vdpa_device *__vdpa_alloc_device(struct device *parent, const struct vdpa_config_ops *config, unsigned int ngroups, unsigned int nas, size_t size, const char *name, bool use_va) { struct vdpa_device *vdev; int err = -EINVAL; if (!config) goto err; if (!!config->dma_map != !!config->dma_unmap) goto err; /* It should only work for the device that use on-chip IOMMU */ if (use_va && !(config->dma_map || config->set_map)) goto err; err = -ENOMEM; vdev = kzalloc(size, GFP_KERNEL); if (!vdev) goto err; err = ida_alloc(&vdpa_index_ida, GFP_KERNEL); if (err < 0) goto err_ida; vdev->dev.bus = &vdpa_bus; vdev->dev.parent = parent; vdev->dev.release = vdpa_release_dev; vdev->index = err; vdev->config = config; vdev->features_valid = false; vdev->use_va = use_va; vdev->ngroups = ngroups; vdev->nas = nas; if (name) err = dev_set_name(&vdev->dev, "%s", name); else err = dev_set_name(&vdev->dev, "vdpa%u", vdev->index); if (err) goto err_name; init_rwsem(&vdev->cf_lock); device_initialize(&vdev->dev); return vdev; err_name: ida_free(&vdpa_index_ida, vdev->index); err_ida: kfree(vdev); err: return ERR_PTR(err); } EXPORT_SYMBOL_GPL(__vdpa_alloc_device); static int vdpa_name_match(struct device *dev, const void *data) { struct vdpa_device *vdev = container_of(dev, struct vdpa_device, dev); return (strcmp(dev_name(&vdev->dev), data) == 0); } static int __vdpa_register_device(struct vdpa_device *vdev, u32 nvqs) { struct device *dev; vdev->nvqs = nvqs; lockdep_assert_held(&vdpa_dev_lock); dev = bus_find_device(&vdpa_bus, NULL, dev_name(&vdev->dev), vdpa_name_match); if (dev) { put_device(dev); return -EEXIST; } return device_add(&vdev->dev); } /** * _vdpa_register_device - register a vDPA device with vdpa lock held * Caller must have a succeed call of vdpa_alloc_device() before. * Caller must invoke this routine in the management device dev_add() * callback after setting up valid mgmtdev for this vdpa device. * @vdev: the vdpa device to be registered to vDPA bus * @nvqs: number of virtqueues supported by this device * * Return: Returns an error when fail to add device to vDPA bus */ int _vdpa_register_device(struct vdpa_device *vdev, u32 nvqs) { if (!vdev->mdev) return -EINVAL; return __vdpa_register_device(vdev, nvqs); } EXPORT_SYMBOL_GPL(_vdpa_register_device); /** * vdpa_register_device - register a vDPA device * Callers must have a succeed call of vdpa_alloc_device() before. * @vdev: the vdpa device to be registered to vDPA bus * @nvqs: number of virtqueues supported by this device * * Return: Returns an error when fail to add to vDPA bus */ int vdpa_register_device(struct vdpa_device *vdev, u32 nvqs) { int err; down_write(&vdpa_dev_lock); err = __vdpa_register_device(vdev, nvqs); up_write(&vdpa_dev_lock); return err; } EXPORT_SYMBOL_GPL(vdpa_register_device); /** * _vdpa_unregister_device - unregister a vDPA device * Caller must invoke this routine as part of management device dev_del() * callback. * @vdev: the vdpa device to be unregisted from vDPA bus */ void _vdpa_unregister_device(struct vdpa_device *vdev) { lockdep_assert_held(&vdpa_dev_lock); WARN_ON(!vdev->mdev); device_unregister(&vdev->dev); } EXPORT_SYMBOL_GPL(_vdpa_unregister_device); /** * vdpa_unregister_device - unregister a vDPA device * @vdev: the vdpa device to be unregisted from vDPA bus */ void vdpa_unregister_device(struct vdpa_device *vdev) { down_write(&vdpa_dev_lock); device_unregister(&vdev->dev); up_write(&vdpa_dev_lock); } EXPORT_SYMBOL_GPL(vdpa_unregister_device); /** * __vdpa_register_driver - register a vDPA device driver * @drv: the vdpa device driver to be registered * @owner: module owner of the driver * * Return: Returns an err when fail to do the registration */ int __vdpa_register_driver(struct vdpa_driver *drv, struct module *owner) { drv->driver.bus = &vdpa_bus; drv->driver.owner = owner; return driver_register(&drv->driver); } EXPORT_SYMBOL_GPL(__vdpa_register_driver); /** * vdpa_unregister_driver - unregister a vDPA device driver * @drv: the vdpa device driver to be unregistered */ void vdpa_unregister_driver(struct vdpa_driver *drv) { driver_unregister(&drv->driver); } EXPORT_SYMBOL_GPL(vdpa_unregister_driver); /** * vdpa_mgmtdev_register - register a vdpa management device * * @mdev: Pointer to vdpa management device * vdpa_mgmtdev_register() register a vdpa management device which supports * vdpa device management. * Return: Returns 0 on success or failure when required callback ops are not * initialized. */ int vdpa_mgmtdev_register(struct vdpa_mgmt_dev *mdev) { if (!mdev->device || !mdev->ops || !mdev->ops->dev_add || !mdev->ops->dev_del) return -EINVAL; INIT_LIST_HEAD(&mdev->list); down_write(&vdpa_dev_lock); list_add_tail(&mdev->list, &mdev_head); up_write(&vdpa_dev_lock); return 0; } EXPORT_SYMBOL_GPL(vdpa_mgmtdev_register); static int vdpa_match_remove(struct device *dev, void *data) { struct vdpa_device *vdev = container_of(dev, struct vdpa_device, dev); struct vdpa_mgmt_dev *mdev = vdev->mdev; if (mdev == data) mdev->ops->dev_del(mdev, vdev); return 0; } void vdpa_mgmtdev_unregister(struct vdpa_mgmt_dev *mdev) { down_write(&vdpa_dev_lock); list_del(&mdev->list); /* Filter out all the entries belong to this management device and delete it. */ bus_for_each_dev(&vdpa_bus, NULL, mdev, vdpa_match_remove); up_write(&vdpa_dev_lock); } EXPORT_SYMBOL_GPL(vdpa_mgmtdev_unregister); static void vdpa_get_config_unlocked(struct vdpa_device *vdev, unsigned int offset, void *buf, unsigned int len) { const struct vdpa_config_ops *ops = vdev->config; /* * Config accesses aren't supposed to trigger before features are set. * If it does happen we assume a legacy guest. */ if (!vdev->features_valid) vdpa_set_features_unlocked(vdev, 0); ops->get_config(vdev, offset, buf, len); } /** * vdpa_get_config - Get one or more device configuration fields. * @vdev: vdpa device to operate on * @offset: starting byte offset of the field * @buf: buffer pointer to read to * @len: length of the configuration fields in bytes */ void vdpa_get_config(struct vdpa_device *vdev, unsigned int offset, void *buf, unsigned int len) { down_read(&vdev->cf_lock); vdpa_get_config_unlocked(vdev, offset, buf, len); up_read(&vdev->cf_lock); } EXPORT_SYMBOL_GPL(vdpa_get_config); /** * vdpa_set_config - Set one or more device configuration fields. * @vdev: vdpa device to operate on * @offset: starting byte offset of the field * @buf: buffer pointer to read from * @length: length of the configuration fields in bytes */ void vdpa_set_config(struct vdpa_device *vdev, unsigned int offset, const void *buf, unsigned int length) { down_write(&vdev->cf_lock); vdev->config->set_config(vdev, offset, buf, length); up_write(&vdev->cf_lock); } EXPORT_SYMBOL_GPL(vdpa_set_config); static bool mgmtdev_handle_match(const struct vdpa_mgmt_dev *mdev, const char *busname, const char *devname) { /* Bus name is optional for simulated management device, so ignore the * device with bus if bus attribute is provided. */ if ((busname && !mdev->device->bus) || (!busname && mdev->device->bus)) return false; if (!busname && strcmp(dev_name(mdev->device), devname) == 0) return true; if (busname && (strcmp(mdev->device->bus->name, busname) == 0) && (strcmp(dev_name(mdev->device), devname) == 0)) return true; return false; } static struct vdpa_mgmt_dev *vdpa_mgmtdev_get_from_attr(struct nlattr **attrs) { struct vdpa_mgmt_dev *mdev; const char *busname = NULL; const char *devname; if (!attrs[VDPA_ATTR_MGMTDEV_DEV_NAME]) return ERR_PTR(-EINVAL); devname = nla_data(attrs[VDPA_ATTR_MGMTDEV_DEV_NAME]); if (attrs[VDPA_ATTR_MGMTDEV_BUS_NAME]) busname = nla_data(attrs[VDPA_ATTR_MGMTDEV_BUS_NAME]); list_for_each_entry(mdev, &mdev_head, list) { if (mgmtdev_handle_match(mdev, busname, devname)) return mdev; } return ERR_PTR(-ENODEV); } static int vdpa_nl_mgmtdev_handle_fill(struct sk_buff *msg, const struct vdpa_mgmt_dev *mdev) { if (mdev->device->bus && nla_put_string(msg, VDPA_ATTR_MGMTDEV_BUS_NAME, mdev->device->bus->name)) return -EMSGSIZE; if (nla_put_string(msg, VDPA_ATTR_MGMTDEV_DEV_NAME, dev_name(mdev->device))) return -EMSGSIZE; return 0; } static u64 vdpa_mgmtdev_get_classes(const struct vdpa_mgmt_dev *mdev, unsigned int *nclasses) { u64 supported_classes = 0; unsigned int n = 0; for (int i = 0; mdev->id_table[i].device; i++) { if (mdev->id_table[i].device > 63) continue; supported_classes |= BIT_ULL(mdev->id_table[i].device); n++; } if (nclasses) *nclasses = n; return supported_classes; } static int vdpa_mgmtdev_fill(const struct vdpa_mgmt_dev *mdev, struct sk_buff *msg, u32 portid, u32 seq, int flags) { void *hdr; int err; hdr = genlmsg_put(msg, portid, seq, &vdpa_nl_family, flags, VDPA_CMD_MGMTDEV_NEW); if (!hdr) return -EMSGSIZE; err = vdpa_nl_mgmtdev_handle_fill(msg, mdev); if (err) goto msg_err; if (nla_put_u64_64bit(msg, VDPA_ATTR_MGMTDEV_SUPPORTED_CLASSES, vdpa_mgmtdev_get_classes(mdev, NULL), VDPA_ATTR_UNSPEC)) { err = -EMSGSIZE; goto msg_err; } if (nla_put_u32(msg, VDPA_ATTR_DEV_MGMTDEV_MAX_VQS, mdev->max_supported_vqs)) { err = -EMSGSIZE; goto msg_err; } if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_SUPPORTED_FEATURES, mdev->supported_features, VDPA_ATTR_PAD)) { err = -EMSGSIZE; goto msg_err; } genlmsg_end(msg, hdr); return 0; msg_err: genlmsg_cancel(msg, hdr); return err; } static int vdpa_nl_cmd_mgmtdev_get_doit(struct sk_buff *skb, struct genl_info *info) { struct vdpa_mgmt_dev *mdev; struct sk_buff *msg; int err; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; down_read(&vdpa_dev_lock); mdev = vdpa_mgmtdev_get_from_attr(info->attrs); if (IS_ERR(mdev)) { up_read(&vdpa_dev_lock); NL_SET_ERR_MSG_MOD(info->extack, "Fail to find the specified mgmt device"); err = PTR_ERR(mdev); goto out; } err = vdpa_mgmtdev_fill(mdev, msg, info->snd_portid, info->snd_seq, 0); up_read(&vdpa_dev_lock); if (err) goto out; err = genlmsg_reply(msg, info); return err; out: nlmsg_free(msg); return err; } static int vdpa_nl_cmd_mgmtdev_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) { struct vdpa_mgmt_dev *mdev; int start = cb->args[0]; int idx = 0; int err; down_read(&vdpa_dev_lock); list_for_each_entry(mdev, &mdev_head, list) { if (idx < start) { idx++; continue; } err = vdpa_mgmtdev_fill(mdev, msg, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI); if (err) goto out; idx++; } out: up_read(&vdpa_dev_lock); cb->args[0] = idx; return msg->len; } #define VDPA_DEV_NET_ATTRS_MASK (BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR) | \ BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU) | \ BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP)) /* * Bitmask for all per-device features: feature bits VIRTIO_TRANSPORT_F_START * through VIRTIO_TRANSPORT_F_END are unset, i.e. 0xfffffc000fffffff for * all 64bit features. If the features are extended beyond 64 bits, or new * "holes" are reserved for other type of features than per-device, this * macro would have to be updated. */ #define VIRTIO_DEVICE_F_MASK (~0ULL << (VIRTIO_TRANSPORT_F_END + 1) | \ ((1ULL << VIRTIO_TRANSPORT_F_START) - 1)) static int vdpa_nl_cmd_dev_add_set_doit(struct sk_buff *skb, struct genl_info *info) { struct vdpa_dev_set_config config = {}; struct nlattr **nl_attrs = info->attrs; struct vdpa_mgmt_dev *mdev; unsigned int ncls = 0; const u8 *macaddr; const char *name; u64 classes; int err = 0; if (!info->attrs[VDPA_ATTR_DEV_NAME]) return -EINVAL; name = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]); if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MACADDR]) { macaddr = nla_data(nl_attrs[VDPA_ATTR_DEV_NET_CFG_MACADDR]); memcpy(config.net.mac, macaddr, sizeof(config.net.mac)); config.mask |= BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR); } if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MTU]) { config.net.mtu = nla_get_u16(nl_attrs[VDPA_ATTR_DEV_NET_CFG_MTU]); config.mask |= BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU); } if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MAX_VQP]) { config.net.max_vq_pairs = nla_get_u16(nl_attrs[VDPA_ATTR_DEV_NET_CFG_MAX_VQP]); if (!config.net.max_vq_pairs) { NL_SET_ERR_MSG_MOD(info->extack, "At least one pair of VQs is required"); return -EINVAL; } config.mask |= BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP); } if (nl_attrs[VDPA_ATTR_DEV_FEATURES]) { u64 missing = 0x0ULL; config.device_features = nla_get_u64(nl_attrs[VDPA_ATTR_DEV_FEATURES]); if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MACADDR] && !(config.device_features & BIT_ULL(VIRTIO_NET_F_MAC))) missing |= BIT_ULL(VIRTIO_NET_F_MAC); if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MTU] && !(config.device_features & BIT_ULL(VIRTIO_NET_F_MTU))) missing |= BIT_ULL(VIRTIO_NET_F_MTU); if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MAX_VQP] && config.net.max_vq_pairs > 1 && !(config.device_features & BIT_ULL(VIRTIO_NET_F_MQ))) missing |= BIT_ULL(VIRTIO_NET_F_MQ); if (missing) { NL_SET_ERR_MSG_FMT_MOD(info->extack, "Missing features 0x%llx for provided attributes", missing); return -EINVAL; } config.mask |= BIT_ULL(VDPA_ATTR_DEV_FEATURES); } /* Skip checking capability if user didn't prefer to configure any * device networking attributes. It is likely that user might have used * a device specific method to configure such attributes or using device * default attributes. */ if ((config.mask & VDPA_DEV_NET_ATTRS_MASK) && !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; down_write(&vdpa_dev_lock); mdev = vdpa_mgmtdev_get_from_attr(info->attrs); if (IS_ERR(mdev)) { NL_SET_ERR_MSG_MOD(info->extack, "Fail to find the specified management device"); err = PTR_ERR(mdev); goto err; } if ((config.mask & mdev->config_attr_mask) != config.mask) { NL_SET_ERR_MSG_FMT_MOD(info->extack, "Some provided attributes are not supported: 0x%llx", config.mask & ~mdev->config_attr_mask); err = -EOPNOTSUPP; goto err; } classes = vdpa_mgmtdev_get_classes(mdev, &ncls); if (config.mask & VDPA_DEV_NET_ATTRS_MASK && !(classes & BIT_ULL(VIRTIO_ID_NET))) { NL_SET_ERR_MSG_MOD(info->extack, "Network class attributes provided on unsupported management device"); err = -EINVAL; goto err; } if (!(config.mask & VDPA_DEV_NET_ATTRS_MASK) && config.mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES) && classes & BIT_ULL(VIRTIO_ID_NET) && ncls > 1 && config.device_features & VIRTIO_DEVICE_F_MASK) { NL_SET_ERR_MSG_MOD(info->extack, "Management device supports multi-class while device features specified are ambiguous"); err = -EINVAL; goto err; } err = mdev->ops->dev_add(mdev, name, &config); err: up_write(&vdpa_dev_lock); return err; } static int vdpa_nl_cmd_dev_del_set_doit(struct sk_buff *skb, struct genl_info *info) { struct vdpa_mgmt_dev *mdev; struct vdpa_device *vdev; struct device *dev; const char *name; int err = 0; if (!info->attrs[VDPA_ATTR_DEV_NAME]) return -EINVAL; name = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]); down_write(&vdpa_dev_lock); dev = bus_find_device(&vdpa_bus, NULL, name, vdpa_name_match); if (!dev) { NL_SET_ERR_MSG_MOD(info->extack, "device not found"); err = -ENODEV; goto dev_err; } vdev = container_of(dev, struct vdpa_device, dev); if (!vdev->mdev) { NL_SET_ERR_MSG_MOD(info->extack, "Only user created device can be deleted by user"); err = -EINVAL; goto mdev_err; } mdev = vdev->mdev; mdev->ops->dev_del(mdev, vdev); mdev_err: put_device(dev); dev_err: up_write(&vdpa_dev_lock); return err; } static int vdpa_dev_fill(struct vdpa_device *vdev, struct sk_buff *msg, u32 portid, u32 seq, int flags, struct netlink_ext_ack *extack) { u16 max_vq_size; u16 min_vq_size = 1; u32 device_id; u32 vendor_id; void *hdr; int err; hdr = genlmsg_put(msg, portid, seq, &vdpa_nl_family, flags, VDPA_CMD_DEV_NEW); if (!hdr) return -EMSGSIZE; err = vdpa_nl_mgmtdev_handle_fill(msg, vdev->mdev); if (err) goto msg_err; device_id = vdev->config->get_device_id(vdev); vendor_id = vdev->config->get_vendor_id(vdev); max_vq_size = vdev->config->get_vq_num_max(vdev); if (vdev->config->get_vq_num_min) min_vq_size = vdev->config->get_vq_num_min(vdev); err = -EMSGSIZE; if (nla_put_string(msg, VDPA_ATTR_DEV_NAME, dev_name(&vdev->dev))) goto msg_err; if (nla_put_u32(msg, VDPA_ATTR_DEV_ID, device_id)) goto msg_err; if (nla_put_u32(msg, VDPA_ATTR_DEV_VENDOR_ID, vendor_id)) goto msg_err; if (nla_put_u32(msg, VDPA_ATTR_DEV_MAX_VQS, vdev->nvqs)) goto msg_err; if (nla_put_u16(msg, VDPA_ATTR_DEV_MAX_VQ_SIZE, max_vq_size)) goto msg_err; if (nla_put_u16(msg, VDPA_ATTR_DEV_MIN_VQ_SIZE, min_vq_size)) goto msg_err; genlmsg_end(msg, hdr); return 0; msg_err: genlmsg_cancel(msg, hdr); return err; } static int vdpa_nl_cmd_dev_get_doit(struct sk_buff *skb, struct genl_info *info) { struct vdpa_device *vdev; struct sk_buff *msg; const char *devname; struct device *dev; int err; if (!info->attrs[VDPA_ATTR_DEV_NAME]) return -EINVAL; devname = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; down_read(&vdpa_dev_lock); dev = bus_find_device(&vdpa_bus, NULL, devname, vdpa_name_match); if (!dev) { NL_SET_ERR_MSG_MOD(info->extack, "device not found"); err = -ENODEV; goto err; } vdev = container_of(dev, struct vdpa_device, dev); if (!vdev->mdev) { err = -EINVAL; goto mdev_err; } err = vdpa_dev_fill(vdev, msg, info->snd_portid, info->snd_seq, 0, info->extack); if (err) goto mdev_err; err = genlmsg_reply(msg, info); put_device(dev); up_read(&vdpa_dev_lock); return err; mdev_err: put_device(dev); err: up_read(&vdpa_dev_lock); nlmsg_free(msg); return err; } struct vdpa_dev_dump_info { struct sk_buff *msg; struct netlink_callback *cb; int start_idx; int idx; }; static int vdpa_dev_dump(struct device *dev, void *data) { struct vdpa_device *vdev = container_of(dev, struct vdpa_device, dev); struct vdpa_dev_dump_info *info = data; int err; if (!vdev->mdev) return 0; if (info->idx < info->start_idx) { info->idx++; return 0; } err = vdpa_dev_fill(vdev, info->msg, NETLINK_CB(info->cb->skb).portid, info->cb->nlh->nlmsg_seq, NLM_F_MULTI, info->cb->extack); if (err) return err; info->idx++; return 0; } static int vdpa_nl_cmd_dev_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) { struct vdpa_dev_dump_info info; info.msg = msg; info.cb = cb; info.start_idx = cb->args[0]; info.idx = 0; down_read(&vdpa_dev_lock); bus_for_each_dev(&vdpa_bus, NULL, &info, vdpa_dev_dump); up_read(&vdpa_dev_lock); cb->args[0] = info.idx; return msg->len; } static int vdpa_dev_net_mq_config_fill(struct sk_buff *msg, u64 features, const struct virtio_net_config *config) { u16 val_u16; if ((features & BIT_ULL(VIRTIO_NET_F_MQ)) == 0 && (features & BIT_ULL(VIRTIO_NET_F_RSS)) == 0) return 0; val_u16 = __virtio16_to_cpu(true, config->max_virtqueue_pairs); return nla_put_u16(msg, VDPA_ATTR_DEV_NET_CFG_MAX_VQP, val_u16); } static int vdpa_dev_net_mtu_config_fill(struct sk_buff *msg, u64 features, const struct virtio_net_config *config) { u16 val_u16; if ((features & BIT_ULL(VIRTIO_NET_F_MTU)) == 0) return 0; val_u16 = __virtio16_to_cpu(true, config->mtu); return nla_put_u16(msg, VDPA_ATTR_DEV_NET_CFG_MTU, val_u16); } static int vdpa_dev_net_mac_config_fill(struct sk_buff *msg, u64 features, const struct virtio_net_config *config) { if ((features & BIT_ULL(VIRTIO_NET_F_MAC)) == 0) return 0; return nla_put(msg, VDPA_ATTR_DEV_NET_CFG_MACADDR, sizeof(config->mac), config->mac); } static int vdpa_dev_net_status_config_fill(struct sk_buff *msg, u64 features, const struct virtio_net_config *config) { u16 val_u16; if ((features & BIT_ULL(VIRTIO_NET_F_STATUS)) == 0) return 0; val_u16 = __virtio16_to_cpu(true, config->status); return nla_put_u16(msg, VDPA_ATTR_DEV_NET_STATUS, val_u16); } static int vdpa_dev_net_config_fill(struct vdpa_device *vdev, struct sk_buff *msg) { struct virtio_net_config config = {}; u64 features_device; vdev->config->get_config(vdev, 0, &config, sizeof(config)); features_device = vdev->config->get_device_features(vdev); if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_FEATURES, features_device, VDPA_ATTR_PAD)) return -EMSGSIZE; if (vdpa_dev_net_mtu_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_net_mac_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_net_status_config_fill(msg, features_device, &config)) return -EMSGSIZE; return vdpa_dev_net_mq_config_fill(msg, features_device, &config); } static int vdpa_dev_blk_capacity_config_fill(struct sk_buff *msg, const struct virtio_blk_config *config) { u64 val_u64; val_u64 = __virtio64_to_cpu(true, config->capacity); return nla_put_u64_64bit(msg, VDPA_ATTR_DEV_BLK_CFG_CAPACITY, val_u64, VDPA_ATTR_PAD); } static int vdpa_dev_blk_seg_size_config_fill(struct sk_buff *msg, u64 features, const struct virtio_blk_config *config) { u32 val_u32; if ((features & BIT_ULL(VIRTIO_BLK_F_SIZE_MAX)) == 0) return 0; val_u32 = __virtio32_to_cpu(true, config->size_max); return nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_SIZE_MAX, val_u32); } /* fill the block size*/ static int vdpa_dev_blk_block_size_config_fill(struct sk_buff *msg, u64 features, const struct virtio_blk_config *config) { u32 val_u32; if ((features & BIT_ULL(VIRTIO_BLK_F_BLK_SIZE)) == 0) return 0; val_u32 = __virtio32_to_cpu(true, config->blk_size); return nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_BLK_SIZE, val_u32); } static int vdpa_dev_blk_seg_max_config_fill(struct sk_buff *msg, u64 features, const struct virtio_blk_config *config) { u32 val_u32; if ((features & BIT_ULL(VIRTIO_BLK_F_SEG_MAX)) == 0) return 0; val_u32 = __virtio32_to_cpu(true, config->seg_max); return nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_SEG_MAX, val_u32); } static int vdpa_dev_blk_mq_config_fill(struct sk_buff *msg, u64 features, const struct virtio_blk_config *config) { u16 val_u16; if ((features & BIT_ULL(VIRTIO_BLK_F_MQ)) == 0) return 0; val_u16 = __virtio16_to_cpu(true, config->num_queues); return nla_put_u16(msg, VDPA_ATTR_DEV_BLK_CFG_NUM_QUEUES, val_u16); } static int vdpa_dev_blk_topology_config_fill(struct sk_buff *msg, u64 features, const struct virtio_blk_config *config) { u16 min_io_size; u32 opt_io_size; if ((features & BIT_ULL(VIRTIO_BLK_F_TOPOLOGY)) == 0) return 0; min_io_size = __virtio16_to_cpu(true, config->min_io_size); opt_io_size = __virtio32_to_cpu(true, config->opt_io_size); if (nla_put_u8(msg, VDPA_ATTR_DEV_BLK_CFG_PHY_BLK_EXP, config->physical_block_exp)) return -EMSGSIZE; if (nla_put_u8(msg, VDPA_ATTR_DEV_BLK_CFG_ALIGN_OFFSET, config->alignment_offset)) return -EMSGSIZE; if (nla_put_u16(msg, VDPA_ATTR_DEV_BLK_CFG_MIN_IO_SIZE, min_io_size)) return -EMSGSIZE; if (nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_OPT_IO_SIZE, opt_io_size)) return -EMSGSIZE; return 0; } static int vdpa_dev_blk_discard_config_fill(struct sk_buff *msg, u64 features, const struct virtio_blk_config *config) { u32 val_u32; if ((features & BIT_ULL(VIRTIO_BLK_F_DISCARD)) == 0) return 0; val_u32 = __virtio32_to_cpu(true, config->max_discard_sectors); if (nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_MAX_DISCARD_SEC, val_u32)) return -EMSGSIZE; val_u32 = __virtio32_to_cpu(true, config->max_discard_seg); if (nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_MAX_DISCARD_SEG, val_u32)) return -EMSGSIZE; val_u32 = __virtio32_to_cpu(true, config->discard_sector_alignment); if (nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_DISCARD_SEC_ALIGN, val_u32)) return -EMSGSIZE; return 0; } static int vdpa_dev_blk_write_zeroes_config_fill(struct sk_buff *msg, u64 features, const struct virtio_blk_config *config) { u32 val_u32; if ((features & BIT_ULL(VIRTIO_BLK_F_WRITE_ZEROES)) == 0) return 0; val_u32 = __virtio32_to_cpu(true, config->max_write_zeroes_sectors); if (nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_MAX_WRITE_ZEROES_SEC, val_u32)) return -EMSGSIZE; val_u32 = __virtio32_to_cpu(true, config->max_write_zeroes_seg); if (nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_MAX_WRITE_ZEROES_SEG, val_u32)) return -EMSGSIZE; return 0; } static int vdpa_dev_blk_ro_config_fill(struct sk_buff *msg, u64 features) { u8 ro; ro = ((features & BIT_ULL(VIRTIO_BLK_F_RO)) == 0) ? 0 : 1; if (nla_put_u8(msg, VDPA_ATTR_DEV_BLK_READ_ONLY, ro)) return -EMSGSIZE; return 0; } static int vdpa_dev_blk_flush_config_fill(struct sk_buff *msg, u64 features) { u8 flush; flush = ((features & BIT_ULL(VIRTIO_BLK_F_FLUSH)) == 0) ? 0 : 1; if (nla_put_u8(msg, VDPA_ATTR_DEV_BLK_FLUSH, flush)) return -EMSGSIZE; return 0; } static int vdpa_dev_blk_config_fill(struct vdpa_device *vdev, struct sk_buff *msg) { struct virtio_blk_config config = {}; u64 features_device; vdev->config->get_config(vdev, 0, &config, sizeof(config)); features_device = vdev->config->get_device_features(vdev); if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_FEATURES, features_device, VDPA_ATTR_PAD)) return -EMSGSIZE; if (vdpa_dev_blk_capacity_config_fill(msg, &config)) return -EMSGSIZE; if (vdpa_dev_blk_seg_size_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_blk_block_size_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_blk_seg_max_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_blk_mq_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_blk_topology_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_blk_discard_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_blk_write_zeroes_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_blk_ro_config_fill(msg, features_device)) return -EMSGSIZE; if (vdpa_dev_blk_flush_config_fill(msg, features_device)) return -EMSGSIZE; return 0; } static int vdpa_dev_config_fill(struct vdpa_device *vdev, struct sk_buff *msg, u32 portid, u32 seq, int flags, struct netlink_ext_ack *extack) { u64 features_driver; u8 status = 0; u32 device_id; void *hdr; int err; down_read(&vdev->cf_lock); hdr = genlmsg_put(msg, portid, seq, &vdpa_nl_family, flags, VDPA_CMD_DEV_CONFIG_GET); if (!hdr) { err = -EMSGSIZE; goto out; } if (nla_put_string(msg, VDPA_ATTR_DEV_NAME, dev_name(&vdev->dev))) { err = -EMSGSIZE; goto msg_err; } device_id = vdev->config->get_device_id(vdev); if (nla_put_u32(msg, VDPA_ATTR_DEV_ID, device_id)) { err = -EMSGSIZE; goto msg_err; } /* only read driver features after the feature negotiation is done */ status = vdev->config->get_status(vdev); if (status & VIRTIO_CONFIG_S_FEATURES_OK) { features_driver = vdev->config->get_driver_features(vdev); if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_NEGOTIATED_FEATURES, features_driver, VDPA_ATTR_PAD)) { err = -EMSGSIZE; goto msg_err; } } switch (device_id) { case VIRTIO_ID_NET: err = vdpa_dev_net_config_fill(vdev, msg); break; case VIRTIO_ID_BLOCK: err = vdpa_dev_blk_config_fill(vdev, msg); break; default: err = -EOPNOTSUPP; break; } if (err) goto msg_err; up_read(&vdev->cf_lock); genlmsg_end(msg, hdr); return 0; msg_err: genlmsg_cancel(msg, hdr); out: up_read(&vdev->cf_lock); return err; } static int vdpa_fill_stats_rec(struct vdpa_device *vdev, struct sk_buff *msg, struct genl_info *info, u32 index) { struct virtio_net_config config = {}; u64 features; u8 status; int err; status = vdev->config->get_status(vdev); if (!(status & VIRTIO_CONFIG_S_FEATURES_OK)) { NL_SET_ERR_MSG_MOD(info->extack, "feature negotiation not complete"); return -EAGAIN; } vdpa_get_config_unlocked(vdev, 0, &config, sizeof(config)); features = vdev->config->get_driver_features(vdev); if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_NEGOTIATED_FEATURES, features, VDPA_ATTR_PAD)) return -EMSGSIZE; err = vdpa_dev_net_mq_config_fill(msg, features, &config); if (err) return err; if (nla_put_u32(msg, VDPA_ATTR_DEV_QUEUE_INDEX, index)) return -EMSGSIZE; err = vdev->config->get_vendor_vq_stats(vdev, index, msg, info->extack); if (err) return err; return 0; } static int vendor_stats_fill(struct vdpa_device *vdev, struct sk_buff *msg, struct genl_info *info, u32 index) { int err; down_read(&vdev->cf_lock); if (!vdev->config->get_vendor_vq_stats) { err = -EOPNOTSUPP; goto out; } err = vdpa_fill_stats_rec(vdev, msg, info, index); out: up_read(&vdev->cf_lock); return err; } static int vdpa_dev_vendor_stats_fill(struct vdpa_device *vdev, struct sk_buff *msg, struct genl_info *info, u32 index) { u32 device_id; void *hdr; int err; u32 portid = info->snd_portid; u32 seq = info->snd_seq; u32 flags = 0; hdr = genlmsg_put(msg, portid, seq, &vdpa_nl_family, flags, VDPA_CMD_DEV_VSTATS_GET); if (!hdr) return -EMSGSIZE; if (nla_put_string(msg, VDPA_ATTR_DEV_NAME, dev_name(&vdev->dev))) { err = -EMSGSIZE; goto undo_msg; } device_id = vdev->config->get_device_id(vdev); if (nla_put_u32(msg, VDPA_ATTR_DEV_ID, device_id)) { err = -EMSGSIZE; goto undo_msg; } switch (device_id) { case VIRTIO_ID_NET: if (index > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX) { NL_SET_ERR_MSG_MOD(info->extack, "queue index exceeds max value"); err = -ERANGE; break; } err = vendor_stats_fill(vdev, msg, info, index); break; default: err = -EOPNOTSUPP; break; } genlmsg_end(msg, hdr); return err; undo_msg: genlmsg_cancel(msg, hdr); return err; } static int vdpa_nl_cmd_dev_config_get_doit(struct sk_buff *skb, struct genl_info *info) { struct vdpa_device *vdev; struct sk_buff *msg; const char *devname; struct device *dev; int err; if (!info->attrs[VDPA_ATTR_DEV_NAME]) return -EINVAL; devname = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; down_read(&vdpa_dev_lock); dev = bus_find_device(&vdpa_bus, NULL, devname, vdpa_name_match); if (!dev) { NL_SET_ERR_MSG_MOD(info->extack, "device not found"); err = -ENODEV; goto dev_err; } vdev = container_of(dev, struct vdpa_device, dev); if (!vdev->mdev) { NL_SET_ERR_MSG_MOD(info->extack, "unmanaged vdpa device"); err = -EINVAL; goto mdev_err; } err = vdpa_dev_config_fill(vdev, msg, info->snd_portid, info->snd_seq, 0, info->extack); if (!err) err = genlmsg_reply(msg, info); mdev_err: put_device(dev); dev_err: up_read(&vdpa_dev_lock); if (err) nlmsg_free(msg); return err; } static int vdpa_dev_config_dump(struct device *dev, void *data) { struct vdpa_device *vdev = container_of(dev, struct vdpa_device, dev); struct vdpa_dev_dump_info *info = data; int err; if (!vdev->mdev) return 0; if (info->idx < info->start_idx) { info->idx++; return 0; } err = vdpa_dev_config_fill(vdev, info->msg, NETLINK_CB(info->cb->skb).portid, info->cb->nlh->nlmsg_seq, NLM_F_MULTI, info->cb->extack); if (err) return err; info->idx++; return 0; } static int vdpa_nl_cmd_dev_config_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) { struct vdpa_dev_dump_info info; info.msg = msg; info.cb = cb; info.start_idx = cb->args[0]; info.idx = 0; down_read(&vdpa_dev_lock); bus_for_each_dev(&vdpa_bus, NULL, &info, vdpa_dev_config_dump); up_read(&vdpa_dev_lock); cb->args[0] = info.idx; return msg->len; } static int vdpa_nl_cmd_dev_stats_get_doit(struct sk_buff *skb, struct genl_info *info) { struct vdpa_device *vdev; struct sk_buff *msg; const char *devname; struct device *dev; u32 index; int err; if (!info->attrs[VDPA_ATTR_DEV_NAME]) return -EINVAL; if (!info->attrs[VDPA_ATTR_DEV_QUEUE_INDEX]) return -EINVAL; devname = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; index = nla_get_u32(info->attrs[VDPA_ATTR_DEV_QUEUE_INDEX]); down_read(&vdpa_dev_lock); dev = bus_find_device(&vdpa_bus, NULL, devname, vdpa_name_match); if (!dev) { NL_SET_ERR_MSG_MOD(info->extack, "device not found"); err = -ENODEV; goto dev_err; } vdev = container_of(dev, struct vdpa_device, dev); if (!vdev->mdev) { NL_SET_ERR_MSG_MOD(info->extack, "unmanaged vdpa device"); err = -EINVAL; goto mdev_err; } err = vdpa_dev_vendor_stats_fill(vdev, msg, info, index); if (err) goto mdev_err; err = genlmsg_reply(msg, info); put_device(dev); up_read(&vdpa_dev_lock); return err; mdev_err: put_device(dev); dev_err: nlmsg_free(msg); up_read(&vdpa_dev_lock); return err; } static const struct nla_policy vdpa_nl_policy[VDPA_ATTR_MAX + 1] = { [VDPA_ATTR_MGMTDEV_BUS_NAME] = { .type = NLA_NUL_STRING }, [VDPA_ATTR_MGMTDEV_DEV_NAME] = { .type = NLA_STRING }, [VDPA_ATTR_DEV_NAME] = { .type = NLA_STRING }, [VDPA_ATTR_DEV_NET_CFG_MACADDR] = NLA_POLICY_ETH_ADDR, [VDPA_ATTR_DEV_NET_CFG_MAX_VQP] = { .type = NLA_U16 }, /* virtio spec 1.1 section 5.1.4.1 for valid MTU range */ [VDPA_ATTR_DEV_NET_CFG_MTU] = NLA_POLICY_MIN(NLA_U16, 68), [VDPA_ATTR_DEV_QUEUE_INDEX] = { .type = NLA_U32 }, [VDPA_ATTR_DEV_FEATURES] = { .type = NLA_U64 }, }; static const struct genl_ops vdpa_nl_ops[] = { { .cmd = VDPA_CMD_MGMTDEV_GET, .doit = vdpa_nl_cmd_mgmtdev_get_doit, .dumpit = vdpa_nl_cmd_mgmtdev_get_dumpit, }, { .cmd = VDPA_CMD_DEV_NEW, .doit = vdpa_nl_cmd_dev_add_set_doit, .flags = GENL_ADMIN_PERM, }, { .cmd = VDPA_CMD_DEV_DEL, .doit = vdpa_nl_cmd_dev_del_set_doit, .flags = GENL_ADMIN_PERM, }, { .cmd = VDPA_CMD_DEV_GET, .doit = vdpa_nl_cmd_dev_get_doit, .dumpit = vdpa_nl_cmd_dev_get_dumpit, }, { .cmd = VDPA_CMD_DEV_CONFIG_GET, .doit = vdpa_nl_cmd_dev_config_get_doit, .dumpit = vdpa_nl_cmd_dev_config_get_dumpit, }, { .cmd = VDPA_CMD_DEV_VSTATS_GET, .doit = vdpa_nl_cmd_dev_stats_get_doit, .flags = GENL_ADMIN_PERM, }, }; static struct genl_family vdpa_nl_family __ro_after_init = { .name = VDPA_GENL_NAME, .version = VDPA_GENL_VERSION, .maxattr = VDPA_ATTR_MAX, .policy = vdpa_nl_policy, .netnsok = false, .module = THIS_MODULE, .ops = vdpa_nl_ops, .n_ops = ARRAY_SIZE(vdpa_nl_ops), .resv_start_op = VDPA_CMD_DEV_VSTATS_GET + 1, }; static int vdpa_init(void) { int err; err = bus_register(&vdpa_bus); if (err) return err; err = genl_register_family(&vdpa_nl_family); if (err) goto err; return 0; err: bus_unregister(&vdpa_bus); return err; } static void __exit vdpa_exit(void) { genl_unregister_family(&vdpa_nl_family); bus_unregister(&vdpa_bus); ida_destroy(&vdpa_index_ida); } core_initcall(vdpa_init); module_exit(vdpa_exit); MODULE_AUTHOR("Jason Wang <jasowang@redhat.com>"); MODULE_LICENSE("GPL v2");
7 6 6 6 6 6 6 7 7 7 7 7 1 1 1 1 1 1 1 3 3 3 3 1 1 1 1 1 1 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 // SPDX-License-Identifier: GPL-2.0-only #include "netlink.h" #include "common.h" #include "bitset.h" struct features_req_info { struct ethnl_req_info base; }; struct features_reply_data { struct ethnl_reply_data base; u32 hw[ETHTOOL_DEV_FEATURE_WORDS]; u32 wanted[ETHTOOL_DEV_FEATURE_WORDS]; u32 active[ETHTOOL_DEV_FEATURE_WORDS]; u32 nochange[ETHTOOL_DEV_FEATURE_WORDS]; u32 all[ETHTOOL_DEV_FEATURE_WORDS]; }; #define FEATURES_REPDATA(__reply_base) \ container_of(__reply_base, struct features_reply_data, base) const struct nla_policy ethnl_features_get_policy[] = { [ETHTOOL_A_FEATURES_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static void ethnl_features_to_bitmap32(u32 *dest, netdev_features_t src) { unsigned int i; for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; i++) dest[i] = src >> (32 * i); } static int features_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct features_reply_data *data = FEATURES_REPDATA(reply_base); struct net_device *dev = reply_base->dev; netdev_features_t all_features; ethnl_features_to_bitmap32(data->hw, dev->hw_features); ethnl_features_to_bitmap32(data->wanted, dev->wanted_features); ethnl_features_to_bitmap32(data->active, dev->features); ethnl_features_to_bitmap32(data->nochange, NETIF_F_NEVER_CHANGE); all_features = GENMASK_ULL(NETDEV_FEATURE_COUNT - 1, 0); ethnl_features_to_bitmap32(data->all, all_features); return 0; } static int features_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct features_reply_data *data = FEATURES_REPDATA(reply_base); bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; unsigned int len = 0; int ret; ret = ethnl_bitset32_size(data->hw, data->all, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); if (ret < 0) return ret; len += ret; ret = ethnl_bitset32_size(data->wanted, NULL, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); if (ret < 0) return ret; len += ret; ret = ethnl_bitset32_size(data->active, NULL, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); if (ret < 0) return ret; len += ret; ret = ethnl_bitset32_size(data->nochange, NULL, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); if (ret < 0) return ret; len += ret; return len; } static int features_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct features_reply_data *data = FEATURES_REPDATA(reply_base); bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; int ret; ret = ethnl_put_bitset32(skb, ETHTOOL_A_FEATURES_HW, data->hw, data->all, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); if (ret < 0) return ret; ret = ethnl_put_bitset32(skb, ETHTOOL_A_FEATURES_WANTED, data->wanted, NULL, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); if (ret < 0) return ret; ret = ethnl_put_bitset32(skb, ETHTOOL_A_FEATURES_ACTIVE, data->active, NULL, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); if (ret < 0) return ret; return ethnl_put_bitset32(skb, ETHTOOL_A_FEATURES_NOCHANGE, data->nochange, NULL, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); } const struct ethnl_request_ops ethnl_features_request_ops = { .request_cmd = ETHTOOL_MSG_FEATURES_GET, .reply_cmd = ETHTOOL_MSG_FEATURES_GET_REPLY, .hdr_attr = ETHTOOL_A_FEATURES_HEADER, .req_info_size = sizeof(struct features_req_info), .reply_data_size = sizeof(struct features_reply_data), .prepare_data = features_prepare_data, .reply_size = features_reply_size, .fill_reply = features_fill_reply, }; /* FEATURES_SET */ const struct nla_policy ethnl_features_set_policy[] = { [ETHTOOL_A_FEATURES_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_FEATURES_WANTED] = { .type = NLA_NESTED }, }; static void ethnl_features_to_bitmap(unsigned long *dest, netdev_features_t val) { const unsigned int words = BITS_TO_LONGS(NETDEV_FEATURE_COUNT); unsigned int i; for (i = 0; i < words; i++) dest[i] = (unsigned long)(val >> (i * BITS_PER_LONG)); } static netdev_features_t ethnl_bitmap_to_features(unsigned long *src) { const unsigned int nft_bits = sizeof(netdev_features_t) * BITS_PER_BYTE; const unsigned int words = BITS_TO_LONGS(NETDEV_FEATURE_COUNT); netdev_features_t ret = 0; unsigned int i; for (i = 0; i < words; i++) ret |= (netdev_features_t)(src[i]) << (i * BITS_PER_LONG); ret &= ~(netdev_features_t)0 >> (nft_bits - NETDEV_FEATURE_COUNT); return ret; } static int features_send_reply(struct net_device *dev, struct genl_info *info, const unsigned long *wanted, const unsigned long *wanted_mask, const unsigned long *active, const unsigned long *active_mask, bool compact) { struct sk_buff *rskb; void *reply_payload; int reply_len = 0; int ret; reply_len = ethnl_reply_header_size(); ret = ethnl_bitset_size(wanted, wanted_mask, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); if (ret < 0) goto err; reply_len += ret; ret = ethnl_bitset_size(active, active_mask, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); if (ret < 0) goto err; reply_len += ret; ret = -ENOMEM; rskb = ethnl_reply_init(reply_len, dev, ETHTOOL_MSG_FEATURES_SET_REPLY, ETHTOOL_A_FEATURES_HEADER, info, &reply_payload); if (!rskb) goto err; ret = ethnl_put_bitset(rskb, ETHTOOL_A_FEATURES_WANTED, wanted, wanted_mask, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); if (ret < 0) goto nla_put_failure; ret = ethnl_put_bitset(rskb, ETHTOOL_A_FEATURES_ACTIVE, active, active_mask, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); if (ret < 0) goto nla_put_failure; genlmsg_end(rskb, reply_payload); ret = genlmsg_reply(rskb, info); return ret; nla_put_failure: nlmsg_free(rskb); WARN_ONCE(1, "calculated message payload length (%d) not sufficient\n", reply_len); err: GENL_SET_ERR_MSG(info, "failed to send reply message"); return ret; } int ethnl_set_features(struct sk_buff *skb, struct genl_info *info) { DECLARE_BITMAP(wanted_diff_mask, NETDEV_FEATURE_COUNT); DECLARE_BITMAP(active_diff_mask, NETDEV_FEATURE_COUNT); DECLARE_BITMAP(old_active, NETDEV_FEATURE_COUNT); DECLARE_BITMAP(old_wanted, NETDEV_FEATURE_COUNT); DECLARE_BITMAP(new_active, NETDEV_FEATURE_COUNT); DECLARE_BITMAP(new_wanted, NETDEV_FEATURE_COUNT); DECLARE_BITMAP(req_wanted, NETDEV_FEATURE_COUNT); DECLARE_BITMAP(req_mask, NETDEV_FEATURE_COUNT); struct ethnl_req_info req_info = {}; struct nlattr **tb = info->attrs; struct net_device *dev; bool mod; int ret; if (!tb[ETHTOOL_A_FEATURES_WANTED]) return -EINVAL; ret = ethnl_parse_header_dev_get(&req_info, tb[ETHTOOL_A_FEATURES_HEADER], genl_info_net(info), info->extack, true); if (ret < 0) return ret; dev = req_info.dev; rtnl_lock(); ret = ethnl_ops_begin(dev); if (ret < 0) goto out_rtnl; ethnl_features_to_bitmap(old_active, dev->features); ethnl_features_to_bitmap(old_wanted, dev->wanted_features); ret = ethnl_parse_bitset(req_wanted, req_mask, NETDEV_FEATURE_COUNT, tb[ETHTOOL_A_FEATURES_WANTED], netdev_features_strings, info->extack); if (ret < 0) goto out_ops; if (ethnl_bitmap_to_features(req_mask) & ~NETIF_F_ETHTOOL_BITS) { GENL_SET_ERR_MSG(info, "attempt to change non-ethtool features"); ret = -EINVAL; goto out_ops; } /* set req_wanted bits not in req_mask from old_wanted */ bitmap_and(req_wanted, req_wanted, req_mask, NETDEV_FEATURE_COUNT); bitmap_andnot(new_wanted, old_wanted, req_mask, NETDEV_FEATURE_COUNT); bitmap_or(req_wanted, new_wanted, req_wanted, NETDEV_FEATURE_COUNT); if (!bitmap_equal(req_wanted, old_wanted, NETDEV_FEATURE_COUNT)) { dev->wanted_features &= ~dev->hw_features; dev->wanted_features |= ethnl_bitmap_to_features(req_wanted) & dev->hw_features; __netdev_update_features(dev); } ethnl_features_to_bitmap(new_active, dev->features); mod = !bitmap_equal(old_active, new_active, NETDEV_FEATURE_COUNT); ret = 0; if (!(req_info.flags & ETHTOOL_FLAG_OMIT_REPLY)) { bool compact = req_info.flags & ETHTOOL_FLAG_COMPACT_BITSETS; bitmap_xor(wanted_diff_mask, req_wanted, new_active, NETDEV_FEATURE_COUNT); bitmap_xor(active_diff_mask, old_active, new_active, NETDEV_FEATURE_COUNT); bitmap_and(wanted_diff_mask, wanted_diff_mask, req_mask, NETDEV_FEATURE_COUNT); bitmap_and(req_wanted, req_wanted, wanted_diff_mask, NETDEV_FEATURE_COUNT); bitmap_and(new_active, new_active, active_diff_mask, NETDEV_FEATURE_COUNT); ret = features_send_reply(dev, info, req_wanted, wanted_diff_mask, new_active, active_diff_mask, compact); } if (mod) netdev_features_change(dev); out_ops: ethnl_ops_complete(dev); out_rtnl: rtnl_unlock(); ethnl_parse_header_dev_put(&req_info); return ret; }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Asymmetric Public-key cryptography key type interface * * See Documentation/crypto/asymmetric-keys.rst * * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #ifndef _KEYS_ASYMMETRIC_TYPE_H #define _KEYS_ASYMMETRIC_TYPE_H #include <linux/key-type.h> #include <linux/verification.h> extern struct key_type key_type_asymmetric; /* * The key payload is four words. The asymmetric-type key uses them as * follows: */ enum asymmetric_payload_bits { asym_crypto, /* The data representing the key */ asym_subtype, /* Pointer to an asymmetric_key_subtype struct */ asym_key_ids, /* Pointer to an asymmetric_key_ids struct */ asym_auth /* The key's authorisation (signature, parent key ID) */ }; /* * Identifiers for an asymmetric key ID. We have three ways of looking up a * key derived from an X.509 certificate: * * (1) Serial Number & Issuer. Non-optional. This is the only valid way to * map a PKCS#7 signature to an X.509 certificate. * * (2) Issuer & Subject Unique IDs. Optional. These were the original way to * match X.509 certificates, but have fallen into disuse in favour of (3). * * (3) Auth & Subject Key Identifiers. Optional. SKIDs are only provided on * CA keys that are intended to sign other keys, so don't appear in end * user certificates unless forced. * * We could also support an PGP key identifier, which is just a SHA1 sum of the * public key and certain parameters, but since we don't support PGP keys at * the moment, we shall ignore those. * * What we actually do is provide a place where binary identifiers can be * stashed and then compare against them when checking for an id match. */ struct asymmetric_key_id { unsigned short len; unsigned char data[]; }; struct asymmetric_key_ids { void *id[3]; }; extern bool asymmetric_key_id_same(const struct asymmetric_key_id *kid1, const struct asymmetric_key_id *kid2); extern bool asymmetric_key_id_partial(const struct asymmetric_key_id *kid1, const struct asymmetric_key_id *kid2); extern struct asymmetric_key_id *asymmetric_key_generate_id(const void *val_1, size_t len_1, const void *val_2, size_t len_2); static inline const struct asymmetric_key_ids *asymmetric_key_ids(const struct key *key) { return key->payload.data[asym_key_ids]; } static inline const struct public_key *asymmetric_key_public_key(const struct key *key) { return key->payload.data[asym_crypto]; } extern struct key *find_asymmetric_key(struct key *keyring, const struct asymmetric_key_id *id_0, const struct asymmetric_key_id *id_1, const struct asymmetric_key_id *id_2, bool partial); int x509_load_certificate_list(const u8 cert_list[], const unsigned long list_size, const struct key *keyring); /* * The payload is at the discretion of the subtype. */ #endif /* _KEYS_ASYMMETRIC_TYPE_H */
1843 1821 1824 1826 1481 1493 1476 1691 1689 1673 1665 114 1676 1273 1271 1271 1260 119 1258 1485 1475 1479 5 4 5 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 // SPDX-License-Identifier: GPL-2.0 /* * Lockless hierarchical page accounting & limiting * * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner */ #include <linux/page_counter.h> #include <linux/atomic.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/sched.h> #include <linux/bug.h> #include <asm/page.h> static void propagate_protected_usage(struct page_counter *c, unsigned long usage) { unsigned long protected, old_protected; long delta; if (!c->parent) return; protected = min(usage, READ_ONCE(c->min)); old_protected = atomic_long_read(&c->min_usage); if (protected != old_protected) { old_protected = atomic_long_xchg(&c->min_usage, protected); delta = protected - old_protected; if (delta) atomic_long_add(delta, &c->parent->children_min_usage); } protected = min(usage, READ_ONCE(c->low)); old_protected = atomic_long_read(&c->low_usage); if (protected != old_protected) { old_protected = atomic_long_xchg(&c->low_usage, protected); delta = protected - old_protected; if (delta) atomic_long_add(delta, &c->parent->children_low_usage); } } /** * page_counter_cancel - take pages out of the local counter * @counter: counter * @nr_pages: number of pages to cancel */ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages) { long new; new = atomic_long_sub_return(nr_pages, &counter->usage); /* More uncharges than charges? */ if (WARN_ONCE(new < 0, "page_counter underflow: %ld nr_pages=%lu\n", new, nr_pages)) { new = 0; atomic_long_set(&counter->usage, new); } propagate_protected_usage(counter, new); } /** * page_counter_charge - hierarchically charge pages * @counter: counter * @nr_pages: number of pages to charge * * NOTE: This does not consider any configured counter limits. */ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages) { struct page_counter *c; for (c = counter; c; c = c->parent) { long new; new = atomic_long_add_return(nr_pages, &c->usage); propagate_protected_usage(c, new); /* * This is indeed racy, but we can live with some * inaccuracy in the watermark. */ if (new > READ_ONCE(c->watermark)) WRITE_ONCE(c->watermark, new); } } /** * page_counter_try_charge - try to hierarchically charge pages * @counter: counter * @nr_pages: number of pages to charge * @fail: points first counter to hit its limit, if any * * Returns %true on success, or %false and @fail if the counter or one * of its ancestors has hit its configured limit. */ bool page_counter_try_charge(struct page_counter *counter, unsigned long nr_pages, struct page_counter **fail) { struct page_counter *c; for (c = counter; c; c = c->parent) { long new; /* * Charge speculatively to avoid an expensive CAS. If * a bigger charge fails, it might falsely lock out a * racing smaller charge and send it into reclaim * early, but the error is limited to the difference * between the two sizes, which is less than 2M/4M in * case of a THP locking out a regular page charge. * * The atomic_long_add_return() implies a full memory * barrier between incrementing the count and reading * the limit. When racing with page_counter_set_max(), * we either see the new limit or the setter sees the * counter has changed and retries. */ new = atomic_long_add_return(nr_pages, &c->usage); if (new > c->max) { atomic_long_sub(nr_pages, &c->usage); /* * This is racy, but we can live with some * inaccuracy in the failcnt which is only used * to report stats. */ data_race(c->failcnt++); *fail = c; goto failed; } propagate_protected_usage(c, new); /* * Just like with failcnt, we can live with some * inaccuracy in the watermark. */ if (new > READ_ONCE(c->watermark)) WRITE_ONCE(c->watermark, new); } return true; failed: for (c = counter; c != *fail; c = c->parent) page_counter_cancel(c, nr_pages); return false; } /** * page_counter_uncharge - hierarchically uncharge pages * @counter: counter * @nr_pages: number of pages to uncharge */ void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages) { struct page_counter *c; for (c = counter; c; c = c->parent) page_counter_cancel(c, nr_pages); } /** * page_counter_set_max - set the maximum number of pages allowed * @counter: counter * @nr_pages: limit to set * * Returns 0 on success, -EBUSY if the current number of pages on the * counter already exceeds the specified limit. * * The caller must serialize invocations on the same counter. */ int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages) { for (;;) { unsigned long old; long usage; /* * Update the limit while making sure that it's not * below the concurrently-changing counter value. * * The xchg implies two full memory barriers before * and after, so the read-swap-read is ordered and * ensures coherency with page_counter_try_charge(): * that function modifies the count before checking * the limit, so if it sees the old limit, we see the * modified counter and retry. */ usage = page_counter_read(counter); if (usage > nr_pages) return -EBUSY; old = xchg(&counter->max, nr_pages); if (page_counter_read(counter) <= usage || nr_pages >= old) return 0; counter->max = old; cond_resched(); } } /** * page_counter_set_min - set the amount of protected memory * @counter: counter * @nr_pages: value to set * * The caller must serialize invocations on the same counter. */ void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages) { struct page_counter *c; WRITE_ONCE(counter->min, nr_pages); for (c = counter; c; c = c->parent) propagate_protected_usage(c, atomic_long_read(&c->usage)); } /** * page_counter_set_low - set the amount of protected memory * @counter: counter * @nr_pages: value to set * * The caller must serialize invocations on the same counter. */ void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages) { struct page_counter *c; WRITE_ONCE(counter->low, nr_pages); for (c = counter; c; c = c->parent) propagate_protected_usage(c, atomic_long_read(&c->usage)); } /** * page_counter_memparse - memparse() for page counter limits * @buf: string to parse * @max: string meaning maximum possible value * @nr_pages: returns the result in number of pages * * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be * limited to %PAGE_COUNTER_MAX. */ int page_counter_memparse(const char *buf, const char *max, unsigned long *nr_pages) { char *end; u64 bytes; if (!strcmp(buf, max)) { *nr_pages = PAGE_COUNTER_MAX; return 0; } bytes = memparse(buf, &end); if (*end != '\0') return -EINVAL; *nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX); return 0; }
1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2012-2016 Pablo Neira Ayuso <pablo@netfilter.org> */ #include <linux/init.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #define nft_objref_priv(expr) *((struct nft_object **)nft_expr_priv(expr)) void nft_objref_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_object *obj = nft_objref_priv(expr); obj->ops->eval(obj, regs, pkt); } static int nft_objref_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_object *obj = nft_objref_priv(expr); u8 genmask = nft_genmask_next(ctx->net); u32 objtype; if (!tb[NFTA_OBJREF_IMM_NAME] || !tb[NFTA_OBJREF_IMM_TYPE]) return -EINVAL; objtype = ntohl(nla_get_be32(tb[NFTA_OBJREF_IMM_TYPE])); obj = nft_obj_lookup(ctx->net, ctx->table, tb[NFTA_OBJREF_IMM_NAME], objtype, genmask); if (IS_ERR(obj)) return -ENOENT; if (!nft_use_inc(&obj->use)) return -EMFILE; nft_objref_priv(expr) = obj; return 0; } static int nft_objref_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_object *obj = nft_objref_priv(expr); if (nla_put_string(skb, NFTA_OBJREF_IMM_NAME, obj->key.name) || nla_put_be32(skb, NFTA_OBJREF_IMM_TYPE, htonl(obj->ops->type->type))) goto nla_put_failure; return 0; nla_put_failure: return -1; } static void nft_objref_deactivate(const struct nft_ctx *ctx, const struct nft_expr *expr, enum nft_trans_phase phase) { struct nft_object *obj = nft_objref_priv(expr); if (phase == NFT_TRANS_COMMIT) return; nft_use_dec(&obj->use); } static void nft_objref_activate(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_object *obj = nft_objref_priv(expr); nft_use_inc_restore(&obj->use); } static const struct nft_expr_ops nft_objref_ops = { .type = &nft_objref_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_object *)), .eval = nft_objref_eval, .init = nft_objref_init, .activate = nft_objref_activate, .deactivate = nft_objref_deactivate, .dump = nft_objref_dump, .reduce = NFT_REDUCE_READONLY, }; struct nft_objref_map { struct nft_set *set; u8 sreg; struct nft_set_binding binding; }; void nft_objref_map_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_objref_map *priv = nft_expr_priv(expr); const struct nft_set *set = priv->set; struct net *net = nft_net(pkt); const struct nft_set_ext *ext; struct nft_object *obj; bool found; found = nft_set_do_lookup(net, set, &regs->data[priv->sreg], &ext); if (!found) { ext = nft_set_catchall_lookup(net, set); if (!ext) { regs->verdict.code = NFT_BREAK; return; } } obj = *nft_set_ext_obj(ext); obj->ops->eval(obj, regs, pkt); } static int nft_objref_map_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_objref_map *priv = nft_expr_priv(expr); u8 genmask = nft_genmask_next(ctx->net); struct nft_set *set; int err; set = nft_set_lookup_global(ctx->net, ctx->table, tb[NFTA_OBJREF_SET_NAME], tb[NFTA_OBJREF_SET_ID], genmask); if (IS_ERR(set)) return PTR_ERR(set); if (!(set->flags & NFT_SET_OBJECT)) return -EINVAL; err = nft_parse_register_load(tb[NFTA_OBJREF_SET_SREG], &priv->sreg, set->klen); if (err < 0) return err; priv->binding.flags = set->flags & NFT_SET_OBJECT; err = nf_tables_bind_set(ctx, set, &priv->binding); if (err < 0) return err; priv->set = set; return 0; } static int nft_objref_map_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_objref_map *priv = nft_expr_priv(expr); if (nft_dump_register(skb, NFTA_OBJREF_SET_SREG, priv->sreg) || nla_put_string(skb, NFTA_OBJREF_SET_NAME, priv->set->name)) goto nla_put_failure; return 0; nla_put_failure: return -1; } static void nft_objref_map_deactivate(const struct nft_ctx *ctx, const struct nft_expr *expr, enum nft_trans_phase phase) { struct nft_objref_map *priv = nft_expr_priv(expr); nf_tables_deactivate_set(ctx, priv->set, &priv->binding, phase); } static void nft_objref_map_activate(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_objref_map *priv = nft_expr_priv(expr); nf_tables_activate_set(ctx, priv->set); } static void nft_objref_map_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_objref_map *priv = nft_expr_priv(expr); nf_tables_destroy_set(ctx, priv->set); } static const struct nft_expr_ops nft_objref_map_ops = { .type = &nft_objref_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_objref_map)), .eval = nft_objref_map_eval, .init = nft_objref_map_init, .activate = nft_objref_map_activate, .deactivate = nft_objref_map_deactivate, .destroy = nft_objref_map_destroy, .dump = nft_objref_map_dump, .reduce = NFT_REDUCE_READONLY, }; static const struct nft_expr_ops * nft_objref_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { if (tb[NFTA_OBJREF_SET_SREG] && (tb[NFTA_OBJREF_SET_NAME] || tb[NFTA_OBJREF_SET_ID])) return &nft_objref_map_ops; else if (tb[NFTA_OBJREF_IMM_NAME] && tb[NFTA_OBJREF_IMM_TYPE]) return &nft_objref_ops; return ERR_PTR(-EOPNOTSUPP); } static const struct nla_policy nft_objref_policy[NFTA_OBJREF_MAX + 1] = { [NFTA_OBJREF_IMM_NAME] = { .type = NLA_STRING, .len = NFT_OBJ_MAXNAMELEN - 1 }, [NFTA_OBJREF_IMM_TYPE] = { .type = NLA_U32 }, [NFTA_OBJREF_SET_SREG] = { .type = NLA_U32 }, [NFTA_OBJREF_SET_NAME] = { .type = NLA_STRING, .len = NFT_SET_MAXNAMELEN - 1 }, [NFTA_OBJREF_SET_ID] = { .type = NLA_U32 }, }; struct nft_expr_type nft_objref_type __read_mostly = { .name = "objref", .select_ops = nft_objref_select_ops, .policy = nft_objref_policy, .maxattr = NFTA_OBJREF_MAX, .owner = THIS_MODULE, };
413 963 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2021, Google LLC. * Pasha Tatashin <pasha.tatashin@soleen.com> */ #ifndef __LINUX_PAGE_TABLE_CHECK_H #define __LINUX_PAGE_TABLE_CHECK_H #ifdef CONFIG_PAGE_TABLE_CHECK #include <linux/jump_label.h> extern struct static_key_true page_table_check_disabled; extern struct page_ext_operations page_table_check_ops; void __page_table_check_zero(struct page *page, unsigned int order); void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte); void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd); void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud); void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, unsigned int nr); void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd); void __page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud); void __page_table_check_pte_clear_range(struct mm_struct *mm, unsigned long addr, pmd_t pmd); static inline void page_table_check_alloc(struct page *page, unsigned int order) { if (static_branch_likely(&page_table_check_disabled)) return; __page_table_check_zero(page, order); } static inline void page_table_check_free(struct page *page, unsigned int order) { if (static_branch_likely(&page_table_check_disabled)) return; __page_table_check_zero(page, order); } static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) { if (static_branch_likely(&page_table_check_disabled)) return; __page_table_check_pte_clear(mm, pte); } static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) { if (static_branch_likely(&page_table_check_disabled)) return; __page_table_check_pmd_clear(mm, pmd); } static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) { if (static_branch_likely(&page_table_check_disabled)) return; __page_table_check_pud_clear(mm, pud); } static inline void page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, unsigned int nr) { if (static_branch_likely(&page_table_check_disabled)) return; __page_table_check_ptes_set(mm, ptep, pte, nr); } static inline void page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd) { if (static_branch_likely(&page_table_check_disabled)) return; __page_table_check_pmd_set(mm, pmdp, pmd); } static inline void page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud) { if (static_branch_likely(&page_table_check_disabled)) return; __page_table_check_pud_set(mm, pudp, pud); } static inline void page_table_check_pte_clear_range(struct mm_struct *mm, unsigned long addr, pmd_t pmd) { if (static_branch_likely(&page_table_check_disabled)) return; __page_table_check_pte_clear_range(mm, addr, pmd); } #else static inline void page_table_check_alloc(struct page *page, unsigned int order) { } static inline void page_table_check_free(struct page *page, unsigned int order) { } static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) { } static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) { } static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) { } static inline void page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, unsigned int nr) { } static inline void page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd) { } static inline void page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud) { } static inline void page_table_check_pte_clear_range(struct mm_struct *mm, unsigned long addr, pmd_t pmd) { } #endif /* CONFIG_PAGE_TABLE_CHECK */ #endif /* __LINUX_PAGE_TABLE_CHECK_H */
239 53 50 1 70 1 52 1 31 2 2 34 42 9 2 2 8 835 4 34 1033 239 478 1130 856 830 846 18 239 238 239 3 5 1 5 5 5 4 5 5 5 26 25 28 40 1 1 1 17 2 2 2 15 12 24 9 178 503 224 224 194 12 21 25 24 16 24 4 24 25 12 13 9 25 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_FS_NOTIFY_H #define _LINUX_FS_NOTIFY_H /* * include/linux/fsnotify.h - generic hooks for filesystem notification, to * reduce in-source duplication from both dnotify and inotify. * * We don't compile any of this away in some complicated menagerie of ifdefs. * Instead, we rely on the code inside to optimize away as needed. * * (C) Copyright 2005 Robert Love */ #include <linux/fsnotify_backend.h> #include <linux/audit.h> #include <linux/slab.h> #include <linux/bug.h> /* Are there any inode/mount/sb objects watched with priority prio or above? */ static inline bool fsnotify_sb_has_priority_watchers(struct super_block *sb, int prio) { struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb); /* Were any marks ever added to any object on this sb? */ if (!sbinfo) return false; return atomic_long_read(&sbinfo->watched_objects[prio]); } /* Are there any inode/mount/sb objects that are being watched at all? */ static inline bool fsnotify_sb_has_watchers(struct super_block *sb) { return fsnotify_sb_has_priority_watchers(sb, 0); } /* * Notify this @dir inode about a change in a child directory entry. * The directory entry may have turned positive or negative or its inode may * have changed (i.e. renamed over). * * Unlike fsnotify_parent(), the event will be reported regardless of the * FS_EVENT_ON_CHILD mask on the parent inode and will not be reported if only * the child is interested and not the parent. */ static inline int fsnotify_name(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, u32 cookie) { if (!fsnotify_sb_has_watchers(dir->i_sb)) return 0; return fsnotify(mask, data, data_type, dir, name, NULL, cookie); } static inline void fsnotify_dirent(struct inode *dir, struct dentry *dentry, __u32 mask) { fsnotify_name(mask, dentry, FSNOTIFY_EVENT_DENTRY, dir, &dentry->d_name, 0); } static inline void fsnotify_inode(struct inode *inode, __u32 mask) { if (!fsnotify_sb_has_watchers(inode->i_sb)) return; if (S_ISDIR(inode->i_mode)) mask |= FS_ISDIR; fsnotify(mask, inode, FSNOTIFY_EVENT_INODE, NULL, NULL, inode, 0); } /* Notify this dentry's parent about a child's events. */ static inline int fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data, int data_type) { struct inode *inode = d_inode(dentry); if (!fsnotify_sb_has_watchers(inode->i_sb)) return 0; if (S_ISDIR(inode->i_mode)) { mask |= FS_ISDIR; /* sb/mount marks are not interested in name of directory */ if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED)) goto notify_child; } /* disconnected dentry cannot notify parent */ if (IS_ROOT(dentry)) goto notify_child; return __fsnotify_parent(dentry, mask, data, data_type); notify_child: return fsnotify(mask, data, data_type, NULL, NULL, inode, 0); } /* * Simple wrappers to consolidate calls to fsnotify_parent() when an event * is on a file/dentry. */ static inline void fsnotify_dentry(struct dentry *dentry, __u32 mask) { fsnotify_parent(dentry, mask, dentry, FSNOTIFY_EVENT_DENTRY); } static inline int fsnotify_file(struct file *file, __u32 mask) { const struct path *path; if (file->f_mode & FMODE_NONOTIFY) return 0; path = &file->f_path; /* Permission events require group prio >= FSNOTIFY_PRIO_CONTENT */ if (mask & ALL_FSNOTIFY_PERM_EVENTS && !fsnotify_sb_has_priority_watchers(path->dentry->d_sb, FSNOTIFY_PRIO_CONTENT)) return 0; return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH); } #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS /* * fsnotify_file_area_perm - permission hook before access to file range */ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, const loff_t *ppos, size_t count) { __u32 fsnotify_mask = FS_ACCESS_PERM; /* * filesystem may be modified in the context of permission events * (e.g. by HSM filling a file on access), so sb freeze protection * must not be held. */ lockdep_assert_once(file_write_not_started(file)); if (!(perm_mask & MAY_READ)) return 0; return fsnotify_file(file, fsnotify_mask); } /* * fsnotify_file_perm - permission hook before file access */ static inline int fsnotify_file_perm(struct file *file, int perm_mask) { return fsnotify_file_area_perm(file, perm_mask, NULL, 0); } /* * fsnotify_open_perm - permission hook before file open */ static inline int fsnotify_open_perm(struct file *file) { int ret; if (file->f_flags & __FMODE_EXEC) { ret = fsnotify_file(file, FS_OPEN_EXEC_PERM); if (ret) return ret; } return fsnotify_file(file, FS_OPEN_PERM); } #else static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, const loff_t *ppos, size_t count) { return 0; } static inline int fsnotify_file_perm(struct file *file, int perm_mask) { return 0; } static inline int fsnotify_open_perm(struct file *file) { return 0; } #endif /* * fsnotify_link_count - inode's link count changed */ static inline void fsnotify_link_count(struct inode *inode) { fsnotify_inode(inode, FS_ATTRIB); } /* * fsnotify_move - file old_name at old_dir was moved to new_name at new_dir */ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir, const struct qstr *old_name, int isdir, struct inode *target, struct dentry *moved) { struct inode *source = moved->d_inode; u32 fs_cookie = fsnotify_get_cookie(); __u32 old_dir_mask = FS_MOVED_FROM; __u32 new_dir_mask = FS_MOVED_TO; __u32 rename_mask = FS_RENAME; const struct qstr *new_name = &moved->d_name; if (isdir) { old_dir_mask |= FS_ISDIR; new_dir_mask |= FS_ISDIR; rename_mask |= FS_ISDIR; } /* Event with information about both old and new parent+name */ fsnotify_name(rename_mask, moved, FSNOTIFY_EVENT_DENTRY, old_dir, old_name, 0); fsnotify_name(old_dir_mask, source, FSNOTIFY_EVENT_INODE, old_dir, old_name, fs_cookie); fsnotify_name(new_dir_mask, source, FSNOTIFY_EVENT_INODE, new_dir, new_name, fs_cookie); if (target) fsnotify_link_count(target); fsnotify_inode(source, FS_MOVE_SELF); audit_inode_child(new_dir, moved, AUDIT_TYPE_CHILD_CREATE); } /* * fsnotify_inode_delete - and inode is being evicted from cache, clean up is needed */ static inline void fsnotify_inode_delete(struct inode *inode) { __fsnotify_inode_delete(inode); } /* * fsnotify_vfsmount_delete - a vfsmount is being destroyed, clean up is needed */ static inline void fsnotify_vfsmount_delete(struct vfsmount *mnt) { __fsnotify_vfsmount_delete(mnt); } /* * fsnotify_inoderemove - an inode is going away */ static inline void fsnotify_inoderemove(struct inode *inode) { fsnotify_inode(inode, FS_DELETE_SELF); __fsnotify_inode_delete(inode); } /* * fsnotify_create - 'name' was linked in * * Caller must make sure that dentry->d_name is stable. * Note: some filesystems (e.g. kernfs) leave @dentry negative and instantiate * ->d_inode later */ static inline void fsnotify_create(struct inode *dir, struct dentry *dentry) { audit_inode_child(dir, dentry, AUDIT_TYPE_CHILD_CREATE); fsnotify_dirent(dir, dentry, FS_CREATE); } /* * fsnotify_link - new hardlink in 'inode' directory * * Caller must make sure that new_dentry->d_name is stable. * Note: We have to pass also the linked inode ptr as some filesystems leave * new_dentry->d_inode NULL and instantiate inode pointer later */ static inline void fsnotify_link(struct inode *dir, struct inode *inode, struct dentry *new_dentry) { fsnotify_link_count(inode); audit_inode_child(dir, new_dentry, AUDIT_TYPE_CHILD_CREATE); fsnotify_name(FS_CREATE, inode, FSNOTIFY_EVENT_INODE, dir, &new_dentry->d_name, 0); } /* * fsnotify_delete - @dentry was unlinked and unhashed * * Caller must make sure that dentry->d_name is stable. * * Note: unlike fsnotify_unlink(), we have to pass also the unlinked inode * as this may be called after d_delete() and old_dentry may be negative. */ static inline void fsnotify_delete(struct inode *dir, struct inode *inode, struct dentry *dentry) { __u32 mask = FS_DELETE; if (S_ISDIR(inode->i_mode)) mask |= FS_ISDIR; fsnotify_name(mask, inode, FSNOTIFY_EVENT_INODE, dir, &dentry->d_name, 0); } /** * d_delete_notify - delete a dentry and call fsnotify_delete() * @dentry: The dentry to delete * * This helper is used to guaranty that the unlinked inode cannot be found * by lookup of this name after fsnotify_delete() event has been delivered. */ static inline void d_delete_notify(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); ihold(inode); d_delete(dentry); fsnotify_delete(dir, inode, dentry); iput(inode); } /* * fsnotify_unlink - 'name' was unlinked * * Caller must make sure that dentry->d_name is stable. */ static inline void fsnotify_unlink(struct inode *dir, struct dentry *dentry) { if (WARN_ON_ONCE(d_is_negative(dentry))) return; fsnotify_delete(dir, d_inode(dentry), dentry); } /* * fsnotify_mkdir - directory 'name' was created * * Caller must make sure that dentry->d_name is stable. * Note: some filesystems (e.g. kernfs) leave @dentry negative and instantiate * ->d_inode later */ static inline void fsnotify_mkdir(struct inode *dir, struct dentry *dentry) { audit_inode_child(dir, dentry, AUDIT_TYPE_CHILD_CREATE); fsnotify_dirent(dir, dentry, FS_CREATE | FS_ISDIR); } /* * fsnotify_rmdir - directory 'name' was removed * * Caller must make sure that dentry->d_name is stable. */ static inline void fsnotify_rmdir(struct inode *dir, struct dentry *dentry) { if (WARN_ON_ONCE(d_is_negative(dentry))) return; fsnotify_delete(dir, d_inode(dentry), dentry); } /* * fsnotify_access - file was read */ static inline void fsnotify_access(struct file *file) { fsnotify_file(file, FS_ACCESS); } /* * fsnotify_modify - file was modified */ static inline void fsnotify_modify(struct file *file) { fsnotify_file(file, FS_MODIFY); } /* * fsnotify_open - file was opened */ static inline void fsnotify_open(struct file *file) { __u32 mask = FS_OPEN; if (file->f_flags & __FMODE_EXEC) mask |= FS_OPEN_EXEC; fsnotify_file(file, mask); } /* * fsnotify_close - file was closed */ static inline void fsnotify_close(struct file *file) { __u32 mask = (file->f_mode & FMODE_WRITE) ? FS_CLOSE_WRITE : FS_CLOSE_NOWRITE; fsnotify_file(file, mask); } /* * fsnotify_xattr - extended attributes were changed */ static inline void fsnotify_xattr(struct dentry *dentry) { fsnotify_dentry(dentry, FS_ATTRIB); } /* * fsnotify_change - notify_change event. file was modified and/or metadata * was changed. */ static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid) { __u32 mask = 0; if (ia_valid & ATTR_UID) mask |= FS_ATTRIB; if (ia_valid & ATTR_GID) mask |= FS_ATTRIB; if (ia_valid & ATTR_SIZE) mask |= FS_MODIFY; /* both times implies a utime(s) call */ if ((ia_valid & (ATTR_ATIME | ATTR_MTIME)) == (ATTR_ATIME | ATTR_MTIME)) mask |= FS_ATTRIB; else if (ia_valid & ATTR_ATIME) mask |= FS_ACCESS; else if (ia_valid & ATTR_MTIME) mask |= FS_MODIFY; if (ia_valid & ATTR_MODE) mask |= FS_ATTRIB; if (mask) fsnotify_dentry(dentry, mask); } static inline int fsnotify_sb_error(struct super_block *sb, struct inode *inode, int error) { struct fs_error_report report = { .error = error, .inode = inode, .sb = sb, }; return fsnotify(FS_ERROR, &report, FSNOTIFY_EVENT_ERROR, NULL, NULL, NULL, 0); } #endif /* _LINUX_FS_NOTIFY_H */
2 1 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 // SPDX-License-Identifier: GPL-2.0-or-later /* * eCryptfs: Linux filesystem encryption layer * * Copyright (C) 1997-2003 Erez Zadok * Copyright (C) 2001-2003 Stony Brook University * Copyright (C) 2004-2007 International Business Machines Corp. * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com> * Michael C. Thompson <mcthomps@us.ibm.com> * Tyler Hicks <code@tyhicks.com> */ #include <linux/dcache.h> #include <linux/file.h> #include <linux/module.h> #include <linux/namei.h> #include <linux/skbuff.h> #include <linux/mount.h> #include <linux/pagemap.h> #include <linux/key.h> #include <linux/parser.h> #include <linux/fs_stack.h> #include <linux/slab.h> #include <linux/magic.h> #include "ecryptfs_kernel.h" /* * Module parameter that defines the ecryptfs_verbosity level. */ int ecryptfs_verbosity = 0; module_param(ecryptfs_verbosity, int, 0); MODULE_PARM_DESC(ecryptfs_verbosity, "Initial verbosity level (0 or 1; defaults to " "0, which is Quiet)"); /* * Module parameter that defines the number of message buffer elements */ unsigned int ecryptfs_message_buf_len = ECRYPTFS_DEFAULT_MSG_CTX_ELEMS; module_param(ecryptfs_message_buf_len, uint, 0); MODULE_PARM_DESC(ecryptfs_message_buf_len, "Number of message buffer elements"); /* * Module parameter that defines the maximum guaranteed amount of time to wait * for a response from ecryptfsd. The actual sleep time will be, more than * likely, a small amount greater than this specified value, but only less if * the message successfully arrives. */ signed long ecryptfs_message_wait_timeout = ECRYPTFS_MAX_MSG_CTX_TTL / HZ; module_param(ecryptfs_message_wait_timeout, long, 0); MODULE_PARM_DESC(ecryptfs_message_wait_timeout, "Maximum number of seconds that an operation will " "sleep while waiting for a message response from " "userspace"); /* * Module parameter that is an estimate of the maximum number of users * that will be concurrently using eCryptfs. Set this to the right * value to balance performance and memory use. */ unsigned int ecryptfs_number_of_users = ECRYPTFS_DEFAULT_NUM_USERS; module_param(ecryptfs_number_of_users, uint, 0); MODULE_PARM_DESC(ecryptfs_number_of_users, "An estimate of the number of " "concurrent users of eCryptfs"); void __ecryptfs_printk(const char *fmt, ...) { va_list args; va_start(args, fmt); if (fmt[1] == '7') { /* KERN_DEBUG */ if (ecryptfs_verbosity >= 1) vprintk(fmt, args); } else vprintk(fmt, args); va_end(args); } /* * ecryptfs_init_lower_file * @ecryptfs_dentry: Fully initialized eCryptfs dentry object, with * the lower dentry and the lower mount set * * eCryptfs only ever keeps a single open file for every lower * inode. All I/O operations to the lower inode occur through that * file. When the first eCryptfs dentry that interposes with the first * lower dentry for that inode is created, this function creates the * lower file struct and associates it with the eCryptfs * inode. When all eCryptfs files associated with the inode are released, the * file is closed. * * The lower file will be opened with read/write permissions, if * possible. Otherwise, it is opened read-only. * * This function does nothing if a lower file is already * associated with the eCryptfs inode. * * Returns zero on success; non-zero otherwise */ static int ecryptfs_init_lower_file(struct dentry *dentry, struct file **lower_file) { const struct cred *cred = current_cred(); const struct path *path = ecryptfs_dentry_to_lower_path(dentry); int rc; rc = ecryptfs_privileged_open(lower_file, path->dentry, path->mnt, cred); if (rc) { printk(KERN_ERR "Error opening lower file " "for lower_dentry [0x%p] and lower_mnt [0x%p]; " "rc = [%d]\n", path->dentry, path->mnt, rc); (*lower_file) = NULL; } return rc; } int ecryptfs_get_lower_file(struct dentry *dentry, struct inode *inode) { struct ecryptfs_inode_info *inode_info; int count, rc = 0; inode_info = ecryptfs_inode_to_private(inode); mutex_lock(&inode_info->lower_file_mutex); count = atomic_inc_return(&inode_info->lower_file_count); if (WARN_ON_ONCE(count < 1)) rc = -EINVAL; else if (count == 1) { rc = ecryptfs_init_lower_file(dentry, &inode_info->lower_file); if (rc) atomic_set(&inode_info->lower_file_count, 0); } mutex_unlock(&inode_info->lower_file_mutex); return rc; } void ecryptfs_put_lower_file(struct inode *inode) { struct ecryptfs_inode_info *inode_info; inode_info = ecryptfs_inode_to_private(inode); if (atomic_dec_and_mutex_lock(&inode_info->lower_file_count, &inode_info->lower_file_mutex)) { filemap_write_and_wait(inode->i_mapping); fput(inode_info->lower_file); inode_info->lower_file = NULL; mutex_unlock(&inode_info->lower_file_mutex); } } enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig, ecryptfs_opt_cipher, ecryptfs_opt_ecryptfs_cipher, ecryptfs_opt_ecryptfs_key_bytes, ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata, ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig, ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes, ecryptfs_opt_unlink_sigs, ecryptfs_opt_mount_auth_tok_only, ecryptfs_opt_check_dev_ruid, ecryptfs_opt_err }; static const match_table_t tokens = { {ecryptfs_opt_sig, "sig=%s"}, {ecryptfs_opt_ecryptfs_sig, "ecryptfs_sig=%s"}, {ecryptfs_opt_cipher, "cipher=%s"}, {ecryptfs_opt_ecryptfs_cipher, "ecryptfs_cipher=%s"}, {ecryptfs_opt_ecryptfs_key_bytes, "ecryptfs_key_bytes=%u"}, {ecryptfs_opt_passthrough, "ecryptfs_passthrough"}, {ecryptfs_opt_xattr_metadata, "ecryptfs_xattr_metadata"}, {ecryptfs_opt_encrypted_view, "ecryptfs_encrypted_view"}, {ecryptfs_opt_fnek_sig, "ecryptfs_fnek_sig=%s"}, {ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"}, {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"}, {ecryptfs_opt_unlink_sigs, "ecryptfs_unlink_sigs"}, {ecryptfs_opt_mount_auth_tok_only, "ecryptfs_mount_auth_tok_only"}, {ecryptfs_opt_check_dev_ruid, "ecryptfs_check_dev_ruid"}, {ecryptfs_opt_err, NULL} }; static int ecryptfs_init_global_auth_toks( struct ecryptfs_mount_crypt_stat *mount_crypt_stat) { struct ecryptfs_global_auth_tok *global_auth_tok; struct ecryptfs_auth_tok *auth_tok; int rc = 0; list_for_each_entry(global_auth_tok, &mount_crypt_stat->global_auth_tok_list, mount_crypt_stat_list) { rc = ecryptfs_keyring_auth_tok_for_sig( &global_auth_tok->global_auth_tok_key, &auth_tok, global_auth_tok->sig); if (rc) { printk(KERN_ERR "Could not find valid key in user " "session keyring for sig specified in mount " "option: [%s]\n", global_auth_tok->sig); global_auth_tok->flags |= ECRYPTFS_AUTH_TOK_INVALID; goto out; } else { global_auth_tok->flags &= ~ECRYPTFS_AUTH_TOK_INVALID; up_write(&(global_auth_tok->global_auth_tok_key)->sem); } } out: return rc; } static void ecryptfs_init_mount_crypt_stat( struct ecryptfs_mount_crypt_stat *mount_crypt_stat) { memset((void *)mount_crypt_stat, 0, sizeof(struct ecryptfs_mount_crypt_stat)); INIT_LIST_HEAD(&mount_crypt_stat->global_auth_tok_list); mutex_init(&mount_crypt_stat->global_auth_tok_list_mutex); mount_crypt_stat->flags |= ECRYPTFS_MOUNT_CRYPT_STAT_INITIALIZED; } /** * ecryptfs_parse_options * @sbi: The ecryptfs super block * @options: The options passed to the kernel * @check_ruid: set to 1 if device uid should be checked against the ruid * * Parse mount options: * debug=N - ecryptfs_verbosity level for debug output * sig=XXX - description(signature) of the key to use * * Returns the dentry object of the lower-level (lower/interposed) * directory; We want to mount our stackable file system on top of * that lower directory. * * The signature of the key to use must be the description of a key * already in the keyring. Mounting will fail if the key can not be * found. * * Returns zero on success; non-zero on error */ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options, uid_t *check_ruid) { char *p; int rc = 0; int sig_set = 0; int cipher_name_set = 0; int fn_cipher_name_set = 0; int cipher_key_bytes; int cipher_key_bytes_set = 0; int fn_cipher_key_bytes; int fn_cipher_key_bytes_set = 0; struct ecryptfs_mount_crypt_stat *mount_crypt_stat = &sbi->mount_crypt_stat; substring_t args[MAX_OPT_ARGS]; int token; char *sig_src; char *cipher_name_src; char *fn_cipher_name_src; char *fnek_src; char *cipher_key_bytes_src; char *fn_cipher_key_bytes_src; u8 cipher_code; *check_ruid = 0; if (!options) { rc = -EINVAL; goto out; } ecryptfs_init_mount_crypt_stat(mount_crypt_stat); while ((p = strsep(&options, ",")) != NULL) { if (!*p) continue; token = match_token(p, tokens, args); switch (token) { case ecryptfs_opt_sig: case ecryptfs_opt_ecryptfs_sig: sig_src = args[0].from; rc = ecryptfs_add_global_auth_tok(mount_crypt_stat, sig_src, 0); if (rc) { printk(KERN_ERR "Error attempting to register " "global sig; rc = [%d]\n", rc); goto out; } sig_set = 1; break; case ecryptfs_opt_cipher: case ecryptfs_opt_ecryptfs_cipher: cipher_name_src = args[0].from; strscpy(mount_crypt_stat->global_default_cipher_name, cipher_name_src); cipher_name_set = 1; break; case ecryptfs_opt_ecryptfs_key_bytes: cipher_key_bytes_src = args[0].from; cipher_key_bytes = (int)simple_strtol(cipher_key_bytes_src, &cipher_key_bytes_src, 0); mount_crypt_stat->global_default_cipher_key_size = cipher_key_bytes; cipher_key_bytes_set = 1; break; case ecryptfs_opt_passthrough: mount_crypt_stat->flags |= ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED; break; case ecryptfs_opt_xattr_metadata: mount_crypt_stat->flags |= ECRYPTFS_XATTR_METADATA_ENABLED; break; case ecryptfs_opt_encrypted_view: mount_crypt_stat->flags |= ECRYPTFS_XATTR_METADATA_ENABLED; mount_crypt_stat->flags |= ECRYPTFS_ENCRYPTED_VIEW_ENABLED; break; case ecryptfs_opt_fnek_sig: fnek_src = args[0].from; strscpy(mount_crypt_stat->global_default_fnek_sig, fnek_src); rc = ecryptfs_add_global_auth_tok( mount_crypt_stat, mount_crypt_stat->global_default_fnek_sig, ECRYPTFS_AUTH_TOK_FNEK); if (rc) { printk(KERN_ERR "Error attempting to register " "global fnek sig [%s]; rc = [%d]\n", mount_crypt_stat->global_default_fnek_sig, rc); goto out; } mount_crypt_stat->flags |= (ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES | ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK); break; case ecryptfs_opt_fn_cipher: fn_cipher_name_src = args[0].from; strscpy(mount_crypt_stat->global_default_fn_cipher_name, fn_cipher_name_src); fn_cipher_name_set = 1; break; case ecryptfs_opt_fn_cipher_key_bytes: fn_cipher_key_bytes_src = args[0].from; fn_cipher_key_bytes = (int)simple_strtol(fn_cipher_key_bytes_src, &fn_cipher_key_bytes_src, 0); mount_crypt_stat->global_default_fn_cipher_key_bytes = fn_cipher_key_bytes; fn_cipher_key_bytes_set = 1; break; case ecryptfs_opt_unlink_sigs: mount_crypt_stat->flags |= ECRYPTFS_UNLINK_SIGS; break; case ecryptfs_opt_mount_auth_tok_only: mount_crypt_stat->flags |= ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY; break; case ecryptfs_opt_check_dev_ruid: *check_ruid = 1; break; case ecryptfs_opt_err: default: printk(KERN_WARNING "%s: eCryptfs: unrecognized option [%s]\n", __func__, p); } } if (!sig_set) { rc = -EINVAL; ecryptfs_printk(KERN_ERR, "You must supply at least one valid " "auth tok signature as a mount " "parameter; see the eCryptfs README\n"); goto out; } if (!cipher_name_set) { int cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER); BUG_ON(cipher_name_len > ECRYPTFS_MAX_CIPHER_NAME_SIZE); strcpy(mount_crypt_stat->global_default_cipher_name, ECRYPTFS_DEFAULT_CIPHER); } if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) && !fn_cipher_name_set) strcpy(mount_crypt_stat->global_default_fn_cipher_name, mount_crypt_stat->global_default_cipher_name); if (!cipher_key_bytes_set) mount_crypt_stat->global_default_cipher_key_size = 0; if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) && !fn_cipher_key_bytes_set) mount_crypt_stat->global_default_fn_cipher_key_bytes = mount_crypt_stat->global_default_cipher_key_size; cipher_code = ecryptfs_code_for_cipher_string( mount_crypt_stat->global_default_cipher_name, mount_crypt_stat->global_default_cipher_key_size); if (!cipher_code) { ecryptfs_printk(KERN_ERR, "eCryptfs doesn't support cipher: %s\n", mount_crypt_stat->global_default_cipher_name); rc = -EINVAL; goto out; } mutex_lock(&key_tfm_list_mutex); if (!ecryptfs_tfm_exists(mount_crypt_stat->global_default_cipher_name, NULL)) { rc = ecryptfs_add_new_key_tfm( NULL, mount_crypt_stat->global_default_cipher_name, mount_crypt_stat->global_default_cipher_key_size); if (rc) { printk(KERN_ERR "Error attempting to initialize " "cipher with name = [%s] and key size = [%td]; " "rc = [%d]\n", mount_crypt_stat->global_default_cipher_name, mount_crypt_stat->global_default_cipher_key_size, rc); rc = -EINVAL; mutex_unlock(&key_tfm_list_mutex); goto out; } } if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) && !ecryptfs_tfm_exists( mount_crypt_stat->global_default_fn_cipher_name, NULL)) { rc = ecryptfs_add_new_key_tfm( NULL, mount_crypt_stat->global_default_fn_cipher_name, mount_crypt_stat->global_default_fn_cipher_key_bytes); if (rc) { printk(KERN_ERR "Error attempting to initialize " "cipher with name = [%s] and key size = [%td]; " "rc = [%d]\n", mount_crypt_stat->global_default_fn_cipher_name, mount_crypt_stat->global_default_fn_cipher_key_bytes, rc); rc = -EINVAL; mutex_unlock(&key_tfm_list_mutex); goto out; } } mutex_unlock(&key_tfm_list_mutex); rc = ecryptfs_init_global_auth_toks(mount_crypt_stat); if (rc) printk(KERN_WARNING "One or more global auth toks could not " "properly register; rc = [%d]\n", rc); out: return rc; } struct kmem_cache *ecryptfs_sb_info_cache; static struct file_system_type ecryptfs_fs_type; /* * ecryptfs_mount * @fs_type: The filesystem type that the superblock should belong to * @flags: The flags associated with the mount * @dev_name: The path to mount over * @raw_data: The options passed into the kernel */ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data) { struct super_block *s; struct ecryptfs_sb_info *sbi; struct ecryptfs_mount_crypt_stat *mount_crypt_stat; struct ecryptfs_dentry_info *root_info; const char *err = "Getting sb failed"; struct inode *inode; struct path path; uid_t check_ruid; int rc; sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL); if (!sbi) { rc = -ENOMEM; goto out; } if (!dev_name) { rc = -EINVAL; err = "Device name cannot be null"; goto out; } rc = ecryptfs_parse_options(sbi, raw_data, &check_ruid); if (rc) { err = "Error parsing options"; goto out; } mount_crypt_stat = &sbi->mount_crypt_stat; s = sget(fs_type, NULL, set_anon_super, flags, NULL); if (IS_ERR(s)) { rc = PTR_ERR(s); goto out; } rc = super_setup_bdi(s); if (rc) goto out1; ecryptfs_set_superblock_private(s, sbi); /* ->kill_sb() will take care of sbi after that point */ sbi = NULL; s->s_op = &ecryptfs_sops; s->s_xattr = ecryptfs_xattr_handlers; s->s_d_op = &ecryptfs_dops; err = "Reading sb failed"; rc = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path); if (rc) { ecryptfs_printk(KERN_WARNING, "kern_path() failed\n"); goto out1; } if (path.dentry->d_sb->s_type == &ecryptfs_fs_type) { rc = -EINVAL; printk(KERN_ERR "Mount on filesystem of type " "eCryptfs explicitly disallowed due to " "known incompatibilities\n"); goto out_free; } if (is_idmapped_mnt(path.mnt)) { rc = -EINVAL; printk(KERN_ERR "Mounting on idmapped mounts currently disallowed\n"); goto out_free; } if (check_ruid && !uid_eq(d_inode(path.dentry)->i_uid, current_uid())) { rc = -EPERM; printk(KERN_ERR "Mount of device (uid: %d) not owned by " "requested user (uid: %d)\n", i_uid_read(d_inode(path.dentry)), from_kuid(&init_user_ns, current_uid())); goto out_free; } ecryptfs_set_superblock_lower(s, path.dentry->d_sb); /** * Set the POSIX ACL flag based on whether they're enabled in the lower * mount. */ s->s_flags = flags & ~SB_POSIXACL; s->s_flags |= path.dentry->d_sb->s_flags & SB_POSIXACL; /** * Force a read-only eCryptfs mount when: * 1) The lower mount is ro * 2) The ecryptfs_encrypted_view mount option is specified */ if (sb_rdonly(path.dentry->d_sb) || mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) s->s_flags |= SB_RDONLY; s->s_maxbytes = path.dentry->d_sb->s_maxbytes; s->s_blocksize = path.dentry->d_sb->s_blocksize; s->s_magic = ECRYPTFS_SUPER_MAGIC; s->s_stack_depth = path.dentry->d_sb->s_stack_depth + 1; rc = -EINVAL; if (s->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { pr_err("eCryptfs: maximum fs stacking depth exceeded\n"); goto out_free; } inode = ecryptfs_get_inode(d_inode(path.dentry), s); rc = PTR_ERR(inode); if (IS_ERR(inode)) goto out_free; s->s_root = d_make_root(inode); if (!s->s_root) { rc = -ENOMEM; goto out_free; } rc = -ENOMEM; root_info = kmem_cache_zalloc(ecryptfs_dentry_info_cache, GFP_KERNEL); if (!root_info) goto out_free; /* ->kill_sb() will take care of root_info */ ecryptfs_set_dentry_private(s->s_root, root_info); root_info->lower_path = path; s->s_flags |= SB_ACTIVE; return dget(s->s_root); out_free: path_put(&path); out1: deactivate_locked_super(s); out: if (sbi) { ecryptfs_destroy_mount_crypt_stat(&sbi->mount_crypt_stat); kmem_cache_free(ecryptfs_sb_info_cache, sbi); } printk(KERN_ERR "%s; rc = [%d]\n", err, rc); return ERR_PTR(rc); } /** * ecryptfs_kill_block_super * @sb: The ecryptfs super block * * Used to bring the superblock down and free the private data. */ static void ecryptfs_kill_block_super(struct super_block *sb) { struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb); kill_anon_super(sb); if (!sb_info) return; ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat); kmem_cache_free(ecryptfs_sb_info_cache, sb_info); } static struct file_system_type ecryptfs_fs_type = { .owner = THIS_MODULE, .name = "ecryptfs", .mount = ecryptfs_mount, .kill_sb = ecryptfs_kill_block_super, .fs_flags = 0 }; MODULE_ALIAS_FS("ecryptfs"); /* * inode_info_init_once * * Initializes the ecryptfs_inode_info_cache when it is created */ static void inode_info_init_once(void *vptr) { struct ecryptfs_inode_info *ei = (struct ecryptfs_inode_info *)vptr; inode_init_once(&ei->vfs_inode); } static struct ecryptfs_cache_info { struct kmem_cache **cache; const char *name; size_t size; slab_flags_t flags; void (*ctor)(void *obj); } ecryptfs_cache_infos[] = { { .cache = &ecryptfs_auth_tok_list_item_cache, .name = "ecryptfs_auth_tok_list_item", .size = sizeof(struct ecryptfs_auth_tok_list_item), }, { .cache = &ecryptfs_file_info_cache, .name = "ecryptfs_file_cache", .size = sizeof(struct ecryptfs_file_info), }, { .cache = &ecryptfs_dentry_info_cache, .name = "ecryptfs_dentry_info_cache", .size = sizeof(struct ecryptfs_dentry_info), }, { .cache = &ecryptfs_inode_info_cache, .name = "ecryptfs_inode_cache", .size = sizeof(struct ecryptfs_inode_info), .flags = SLAB_ACCOUNT, .ctor = inode_info_init_once, }, { .cache = &ecryptfs_sb_info_cache, .name = "ecryptfs_sb_cache", .size = sizeof(struct ecryptfs_sb_info), }, { .cache = &ecryptfs_header_cache, .name = "ecryptfs_headers", .size = PAGE_SIZE, }, { .cache = &ecryptfs_xattr_cache, .name = "ecryptfs_xattr_cache", .size = PAGE_SIZE, }, { .cache = &ecryptfs_key_record_cache, .name = "ecryptfs_key_record_cache", .size = sizeof(struct ecryptfs_key_record), }, { .cache = &ecryptfs_key_sig_cache, .name = "ecryptfs_key_sig_cache", .size = sizeof(struct ecryptfs_key_sig), }, { .cache = &ecryptfs_global_auth_tok_cache, .name = "ecryptfs_global_auth_tok_cache", .size = sizeof(struct ecryptfs_global_auth_tok), }, { .cache = &ecryptfs_key_tfm_cache, .name = "ecryptfs_key_tfm_cache", .size = sizeof(struct ecryptfs_key_tfm), }, }; static void ecryptfs_free_kmem_caches(void) { int i; /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); for (i = 0; i < ARRAY_SIZE(ecryptfs_cache_infos); i++) { struct ecryptfs_cache_info *info; info = &ecryptfs_cache_infos[i]; kmem_cache_destroy(*(info->cache)); } } /** * ecryptfs_init_kmem_caches * * Returns zero on success; non-zero otherwise */ static int ecryptfs_init_kmem_caches(void) { int i; for (i = 0; i < ARRAY_SIZE(ecryptfs_cache_infos); i++) { struct ecryptfs_cache_info *info; info = &ecryptfs_cache_infos[i]; *(info->cache) = kmem_cache_create(info->name, info->size, 0, SLAB_HWCACHE_ALIGN | info->flags, info->ctor); if (!*(info->cache)) { ecryptfs_free_kmem_caches(); ecryptfs_printk(KERN_WARNING, "%s: " "kmem_cache_create failed\n", info->name); return -ENOMEM; } } return 0; } static struct kobject *ecryptfs_kobj; static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr, char *buff) { return snprintf(buff, PAGE_SIZE, "%d\n", ECRYPTFS_VERSIONING_MASK); } static struct kobj_attribute version_attr = __ATTR_RO(version); static struct attribute *attributes[] = { &version_attr.attr, NULL, }; static const struct attribute_group attr_group = { .attrs = attributes, }; static int do_sysfs_registration(void) { int rc; ecryptfs_kobj = kobject_create_and_add("ecryptfs", fs_kobj); if (!ecryptfs_kobj) { printk(KERN_ERR "Unable to create ecryptfs kset\n"); rc = -ENOMEM; goto out; } rc = sysfs_create_group(ecryptfs_kobj, &attr_group); if (rc) { printk(KERN_ERR "Unable to create ecryptfs version attributes\n"); kobject_put(ecryptfs_kobj); } out: return rc; } static void do_sysfs_unregistration(void) { sysfs_remove_group(ecryptfs_kobj, &attr_group); kobject_put(ecryptfs_kobj); } static int __init ecryptfs_init(void) { int rc; if (ECRYPTFS_DEFAULT_EXTENT_SIZE > PAGE_SIZE) { rc = -EINVAL; ecryptfs_printk(KERN_ERR, "The eCryptfs extent size is " "larger than the host's page size, and so " "eCryptfs cannot run on this system. The " "default eCryptfs extent size is [%u] bytes; " "the page size is [%lu] bytes.\n", ECRYPTFS_DEFAULT_EXTENT_SIZE, (unsigned long)PAGE_SIZE); goto out; } rc = ecryptfs_init_kmem_caches(); if (rc) { printk(KERN_ERR "Failed to allocate one or more kmem_cache objects\n"); goto out; } rc = do_sysfs_registration(); if (rc) { printk(KERN_ERR "sysfs registration failed\n"); goto out_free_kmem_caches; } rc = ecryptfs_init_kthread(); if (rc) { printk(KERN_ERR "%s: kthread initialization failed; " "rc = [%d]\n", __func__, rc); goto out_do_sysfs_unregistration; } rc = ecryptfs_init_messaging(); if (rc) { printk(KERN_ERR "Failure occurred while attempting to " "initialize the communications channel to " "ecryptfsd\n"); goto out_destroy_kthread; } rc = ecryptfs_init_crypto(); if (rc) { printk(KERN_ERR "Failure whilst attempting to init crypto; " "rc = [%d]\n", rc); goto out_release_messaging; } rc = register_filesystem(&ecryptfs_fs_type); if (rc) { printk(KERN_ERR "Failed to register filesystem\n"); goto out_destroy_crypto; } if (ecryptfs_verbosity > 0) printk(KERN_CRIT "eCryptfs verbosity set to %d. Secret values " "will be written to the syslog!\n", ecryptfs_verbosity); goto out; out_destroy_crypto: ecryptfs_destroy_crypto(); out_release_messaging: ecryptfs_release_messaging(); out_destroy_kthread: ecryptfs_destroy_kthread(); out_do_sysfs_unregistration: do_sysfs_unregistration(); out_free_kmem_caches: ecryptfs_free_kmem_caches(); out: return rc; } static void __exit ecryptfs_exit(void) { int rc; rc = ecryptfs_destroy_crypto(); if (rc) printk(KERN_ERR "Failure whilst attempting to destroy crypto; " "rc = [%d]\n", rc); ecryptfs_release_messaging(); ecryptfs_destroy_kthread(); do_sysfs_unregistration(); unregister_filesystem(&ecryptfs_fs_type); ecryptfs_free_kmem_caches(); } MODULE_AUTHOR("Michael A. Halcrow <mhalcrow@us.ibm.com>"); MODULE_DESCRIPTION("eCryptfs"); MODULE_LICENSE("GPL"); module_init(ecryptfs_init) module_exit(ecryptfs_exit)
9 6 9 9 9 9 8 9 12 12 12 8 6 10 11 10 12 26 26 25 2 2 26 24 17 1 1 1 1 1 1 1 1 2 1 1 1 3 3 3 3 3 1 1 1 1 1 2 3 3 3 2 3 1 1 1 1 1 1 1 1 3 3 2 2 2 2 2 1 2 1 2 1 2 2 3 1 1 1 1 1 1 1 1 1 1 15 11 7 3 15 15 13 13 4 13 13 1 13 13 6 6 1 6 2 6 1 6 6 6 6 1 1 6 1 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2013 Nicira, Inc. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/in.h> #include <linux/if_arp.h> #include <linux/init.h> #include <linux/in6.h> #include <linux/inetdevice.h> #include <linux/netfilter_ipv4.h> #include <linux/etherdevice.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/static_key.h> #include <net/ip.h> #include <net/icmp.h> #include <net/protocol.h> #include <net/ip_tunnels.h> #include <net/ip6_tunnel.h> #include <net/ip6_checksum.h> #include <net/arp.h> #include <net/checksum.h> #include <net/dsfield.h> #include <net/inet_ecn.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/rtnetlink.h> #include <net/dst_metadata.h> #include <net/geneve.h> #include <net/vxlan.h> #include <net/erspan.h> const struct ip_tunnel_encap_ops __rcu * iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly; EXPORT_SYMBOL(iptun_encaps); const struct ip6_tnl_encap_ops __rcu * ip6tun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly; EXPORT_SYMBOL(ip6tun_encaps); void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, __u8 proto, __u8 tos, __u8 ttl, __be16 df, bool xnet) { int pkt_len = skb->len - skb_inner_network_offset(skb); struct net *net = dev_net(rt->dst.dev); struct net_device *dev = skb->dev; struct iphdr *iph; int err; skb_scrub_packet(skb, xnet); skb_clear_hash_if_not_l4(skb); skb_dst_set(skb, &rt->dst); memset(IPCB(skb), 0, sizeof(*IPCB(skb))); /* Push down and install the IP header. */ skb_push(skb, sizeof(struct iphdr)); skb_reset_network_header(skb); iph = ip_hdr(skb); iph->version = 4; iph->ihl = sizeof(struct iphdr) >> 2; iph->frag_off = ip_mtu_locked(&rt->dst) ? 0 : df; iph->protocol = proto; iph->tos = tos; iph->daddr = dst; iph->saddr = src; iph->ttl = ttl; __ip_select_ident(net, iph, skb_shinfo(skb)->gso_segs ?: 1); err = ip_local_out(net, sk, skb); if (dev) { if (unlikely(net_xmit_eval(err))) pkt_len = 0; iptunnel_xmit_stats(dev, pkt_len); } } EXPORT_SYMBOL_GPL(iptunnel_xmit); int __iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto, bool raw_proto, bool xnet) { if (unlikely(!pskb_may_pull(skb, hdr_len))) return -ENOMEM; skb_pull_rcsum(skb, hdr_len); if (!raw_proto && inner_proto == htons(ETH_P_TEB)) { struct ethhdr *eh; if (unlikely(!pskb_may_pull(skb, ETH_HLEN))) return -ENOMEM; eh = (struct ethhdr *)skb->data; if (likely(eth_proto_is_802_3(eh->h_proto))) skb->protocol = eh->h_proto; else skb->protocol = htons(ETH_P_802_2); } else { skb->protocol = inner_proto; } skb_clear_hash_if_not_l4(skb); __vlan_hwaccel_clear_tag(skb); skb_set_queue_mapping(skb, 0); skb_scrub_packet(skb, xnet); return iptunnel_pull_offloads(skb); } EXPORT_SYMBOL_GPL(__iptunnel_pull_header); struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, gfp_t flags) { IP_TUNNEL_DECLARE_FLAGS(tun_flags) = { }; struct metadata_dst *res; struct ip_tunnel_info *dst, *src; if (!md || md->type != METADATA_IP_TUNNEL || md->u.tun_info.mode & IP_TUNNEL_INFO_TX) return NULL; src = &md->u.tun_info; res = metadata_dst_alloc(src->options_len, METADATA_IP_TUNNEL, flags); if (!res) return NULL; dst = &res->u.tun_info; dst->key.tun_id = src->key.tun_id; if (src->mode & IP_TUNNEL_INFO_IPV6) memcpy(&dst->key.u.ipv6.dst, &src->key.u.ipv6.src, sizeof(struct in6_addr)); else dst->key.u.ipv4.dst = src->key.u.ipv4.src; ip_tunnel_flags_copy(dst->key.tun_flags, src->key.tun_flags); dst->mode = src->mode | IP_TUNNEL_INFO_TX; ip_tunnel_info_opts_set(dst, ip_tunnel_info_opts(src), src->options_len, tun_flags); return res; } EXPORT_SYMBOL_GPL(iptunnel_metadata_reply); int iptunnel_handle_offloads(struct sk_buff *skb, int gso_type_mask) { int err; if (likely(!skb->encapsulation)) { skb_reset_inner_headers(skb); skb->encapsulation = 1; } if (skb_is_gso(skb)) { err = skb_header_unclone(skb, GFP_ATOMIC); if (unlikely(err)) return err; skb_shinfo(skb)->gso_type |= gso_type_mask; return 0; } if (skb->ip_summed != CHECKSUM_PARTIAL) { skb->ip_summed = CHECKSUM_NONE; /* We clear encapsulation here to prevent badly-written * drivers potentially deciding to offload an inner checksum * if we set CHECKSUM_PARTIAL on the outer header. * This should go away when the drivers are all fixed. */ skb->encapsulation = 0; } return 0; } EXPORT_SYMBOL_GPL(iptunnel_handle_offloads); /** * iptunnel_pmtud_build_icmp() - Build ICMP error message for PMTUD * @skb: Original packet with L2 header * @mtu: MTU value for ICMP error * * Return: length on success, negative error code if message couldn't be built. */ static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu) { const struct iphdr *iph = ip_hdr(skb); struct icmphdr *icmph; struct iphdr *niph; struct ethhdr eh; int len, err; if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct iphdr))) return -EINVAL; skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN); pskb_pull(skb, ETH_HLEN); skb_reset_network_header(skb); err = pskb_trim(skb, 576 - sizeof(*niph) - sizeof(*icmph)); if (err) return err; len = skb->len + sizeof(*icmph); err = skb_cow(skb, sizeof(*niph) + sizeof(*icmph) + ETH_HLEN); if (err) return err; icmph = skb_push(skb, sizeof(*icmph)); *icmph = (struct icmphdr) { .type = ICMP_DEST_UNREACH, .code = ICMP_FRAG_NEEDED, .checksum = 0, .un.frag.__unused = 0, .un.frag.mtu = htons(mtu), }; icmph->checksum = csum_fold(skb_checksum(skb, 0, len, 0)); skb_reset_transport_header(skb); niph = skb_push(skb, sizeof(*niph)); *niph = (struct iphdr) { .ihl = sizeof(*niph) / 4u, .version = 4, .tos = 0, .tot_len = htons(len + sizeof(*niph)), .id = 0, .frag_off = htons(IP_DF), .ttl = iph->ttl, .protocol = IPPROTO_ICMP, .saddr = iph->daddr, .daddr = iph->saddr, }; ip_send_check(niph); skb_reset_network_header(skb); skb->ip_summed = CHECKSUM_NONE; eth_header(skb, skb->dev, ntohs(eh.h_proto), eh.h_source, eh.h_dest, 0); skb_reset_mac_header(skb); return skb->len; } /** * iptunnel_pmtud_check_icmp() - Trigger ICMP reply if needed and allowed * @skb: Buffer being sent by encapsulation, L2 headers expected * @mtu: Network MTU for path * * Return: 0 for no ICMP reply, length if built, negative value on error. */ static int iptunnel_pmtud_check_icmp(struct sk_buff *skb, int mtu) { const struct icmphdr *icmph = icmp_hdr(skb); const struct iphdr *iph = ip_hdr(skb); if (mtu < 576 || iph->frag_off != htons(IP_DF)) return 0; if (ipv4_is_lbcast(iph->daddr) || ipv4_is_multicast(iph->daddr) || ipv4_is_zeronet(iph->saddr) || ipv4_is_loopback(iph->saddr) || ipv4_is_lbcast(iph->saddr) || ipv4_is_multicast(iph->saddr)) return 0; if (iph->protocol == IPPROTO_ICMP && icmp_is_err(icmph->type)) return 0; return iptunnel_pmtud_build_icmp(skb, mtu); } #if IS_ENABLED(CONFIG_IPV6) /** * iptunnel_pmtud_build_icmpv6() - Build ICMPv6 error message for PMTUD * @skb: Original packet with L2 header * @mtu: MTU value for ICMPv6 error * * Return: length on success, negative error code if message couldn't be built. */ static int iptunnel_pmtud_build_icmpv6(struct sk_buff *skb, int mtu) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); struct icmp6hdr *icmp6h; struct ipv6hdr *nip6h; struct ethhdr eh; int len, err; __wsum csum; if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct ipv6hdr))) return -EINVAL; skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN); pskb_pull(skb, ETH_HLEN); skb_reset_network_header(skb); err = pskb_trim(skb, IPV6_MIN_MTU - sizeof(*nip6h) - sizeof(*icmp6h)); if (err) return err; len = skb->len + sizeof(*icmp6h); err = skb_cow(skb, sizeof(*nip6h) + sizeof(*icmp6h) + ETH_HLEN); if (err) return err; icmp6h = skb_push(skb, sizeof(*icmp6h)); *icmp6h = (struct icmp6hdr) { .icmp6_type = ICMPV6_PKT_TOOBIG, .icmp6_code = 0, .icmp6_cksum = 0, .icmp6_mtu = htonl(mtu), }; skb_reset_transport_header(skb); nip6h = skb_push(skb, sizeof(*nip6h)); *nip6h = (struct ipv6hdr) { .priority = 0, .version = 6, .flow_lbl = { 0 }, .payload_len = htons(len), .nexthdr = IPPROTO_ICMPV6, .hop_limit = ip6h->hop_limit, .saddr = ip6h->daddr, .daddr = ip6h->saddr, }; skb_reset_network_header(skb); csum = skb_checksum(skb, skb_transport_offset(skb), len, 0); icmp6h->icmp6_cksum = csum_ipv6_magic(&nip6h->saddr, &nip6h->daddr, len, IPPROTO_ICMPV6, csum); skb->ip_summed = CHECKSUM_NONE; eth_header(skb, skb->dev, ntohs(eh.h_proto), eh.h_source, eh.h_dest, 0); skb_reset_mac_header(skb); return skb->len; } /** * iptunnel_pmtud_check_icmpv6() - Trigger ICMPv6 reply if needed and allowed * @skb: Buffer being sent by encapsulation, L2 headers expected * @mtu: Network MTU for path * * Return: 0 for no ICMPv6 reply, length if built, negative value on error. */ static int iptunnel_pmtud_check_icmpv6(struct sk_buff *skb, int mtu) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); int stype = ipv6_addr_type(&ip6h->saddr); u8 proto = ip6h->nexthdr; __be16 frag_off; int offset; if (mtu < IPV6_MIN_MTU) return 0; if (stype == IPV6_ADDR_ANY || stype == IPV6_ADDR_MULTICAST || stype == IPV6_ADDR_LOOPBACK) return 0; offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &proto, &frag_off); if (offset < 0 || (frag_off & htons(~0x7))) return 0; if (proto == IPPROTO_ICMPV6) { struct icmp6hdr *icmp6h; if (!pskb_may_pull(skb, skb_network_header(skb) + offset + 1 - skb->data)) return 0; icmp6h = (struct icmp6hdr *)(skb_network_header(skb) + offset); if (icmpv6_is_err(icmp6h->icmp6_type) || icmp6h->icmp6_type == NDISC_REDIRECT) return 0; } return iptunnel_pmtud_build_icmpv6(skb, mtu); } #endif /* IS_ENABLED(CONFIG_IPV6) */ /** * skb_tunnel_check_pmtu() - Check, update PMTU and trigger ICMP reply as needed * @skb: Buffer being sent by encapsulation, L2 headers expected * @encap_dst: Destination for tunnel encapsulation (outer IP) * @headroom: Encapsulation header size, bytes * @reply: Build matching ICMP or ICMPv6 message as a result * * L2 tunnel implementations that can carry IP and can be directly bridged * (currently UDP tunnels) can't always rely on IP forwarding paths to handle * PMTU discovery. In the bridged case, ICMP or ICMPv6 messages need to be built * based on payload and sent back by the encapsulation itself. * * For routable interfaces, we just need to update the PMTU for the destination. * * Return: 0 if ICMP error not needed, length if built, negative value on error */ int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst, int headroom, bool reply) { u32 mtu = dst_mtu(encap_dst) - headroom; if ((skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) || (!skb_is_gso(skb) && (skb->len - skb_network_offset(skb)) <= mtu)) return 0; skb_dst_update_pmtu_no_confirm(skb, mtu); if (!reply || skb->pkt_type == PACKET_HOST) return 0; if (skb->protocol == htons(ETH_P_IP)) return iptunnel_pmtud_check_icmp(skb, mtu); #if IS_ENABLED(CONFIG_IPV6) if (skb->protocol == htons(ETH_P_IPV6)) return iptunnel_pmtud_check_icmpv6(skb, mtu); #endif return 0; } EXPORT_SYMBOL(skb_tunnel_check_pmtu); static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = { [LWTUNNEL_IP_UNSPEC] = { .strict_start_type = LWTUNNEL_IP_OPTS }, [LWTUNNEL_IP_ID] = { .type = NLA_U64 }, [LWTUNNEL_IP_DST] = { .type = NLA_U32 }, [LWTUNNEL_IP_SRC] = { .type = NLA_U32 }, [LWTUNNEL_IP_TTL] = { .type = NLA_U8 }, [LWTUNNEL_IP_TOS] = { .type = NLA_U8 }, [LWTUNNEL_IP_FLAGS] = { .type = NLA_U16 }, [LWTUNNEL_IP_OPTS] = { .type = NLA_NESTED }, }; static const struct nla_policy ip_opts_policy[LWTUNNEL_IP_OPTS_MAX + 1] = { [LWTUNNEL_IP_OPTS_GENEVE] = { .type = NLA_NESTED }, [LWTUNNEL_IP_OPTS_VXLAN] = { .type = NLA_NESTED }, [LWTUNNEL_IP_OPTS_ERSPAN] = { .type = NLA_NESTED }, }; static const struct nla_policy geneve_opt_policy[LWTUNNEL_IP_OPT_GENEVE_MAX + 1] = { [LWTUNNEL_IP_OPT_GENEVE_CLASS] = { .type = NLA_U16 }, [LWTUNNEL_IP_OPT_GENEVE_TYPE] = { .type = NLA_U8 }, [LWTUNNEL_IP_OPT_GENEVE_DATA] = { .type = NLA_BINARY, .len = 128 }, }; static const struct nla_policy vxlan_opt_policy[LWTUNNEL_IP_OPT_VXLAN_MAX + 1] = { [LWTUNNEL_IP_OPT_VXLAN_GBP] = { .type = NLA_U32 }, }; static const struct nla_policy erspan_opt_policy[LWTUNNEL_IP_OPT_ERSPAN_MAX + 1] = { [LWTUNNEL_IP_OPT_ERSPAN_VER] = { .type = NLA_U8 }, [LWTUNNEL_IP_OPT_ERSPAN_INDEX] = { .type = NLA_U32 }, [LWTUNNEL_IP_OPT_ERSPAN_DIR] = { .type = NLA_U8 }, [LWTUNNEL_IP_OPT_ERSPAN_HWID] = { .type = NLA_U8 }, }; static int ip_tun_parse_opts_geneve(struct nlattr *attr, struct ip_tunnel_info *info, int opts_len, struct netlink_ext_ack *extack) { struct nlattr *tb[LWTUNNEL_IP_OPT_GENEVE_MAX + 1]; int data_len, err; err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_GENEVE_MAX, attr, geneve_opt_policy, extack); if (err) return err; if (!tb[LWTUNNEL_IP_OPT_GENEVE_CLASS] || !tb[LWTUNNEL_IP_OPT_GENEVE_TYPE] || !tb[LWTUNNEL_IP_OPT_GENEVE_DATA]) return -EINVAL; attr = tb[LWTUNNEL_IP_OPT_GENEVE_DATA]; data_len = nla_len(attr); if (data_len % 4) return -EINVAL; if (info) { struct geneve_opt *opt = ip_tunnel_info_opts(info) + opts_len; memcpy(opt->opt_data, nla_data(attr), data_len); opt->length = data_len / 4; attr = tb[LWTUNNEL_IP_OPT_GENEVE_CLASS]; opt->opt_class = nla_get_be16(attr); attr = tb[LWTUNNEL_IP_OPT_GENEVE_TYPE]; opt->type = nla_get_u8(attr); __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, info->key.tun_flags); } return sizeof(struct geneve_opt) + data_len; } static int ip_tun_parse_opts_vxlan(struct nlattr *attr, struct ip_tunnel_info *info, int opts_len, struct netlink_ext_ack *extack) { struct nlattr *tb[LWTUNNEL_IP_OPT_VXLAN_MAX + 1]; int err; err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_VXLAN_MAX, attr, vxlan_opt_policy, extack); if (err) return err; if (!tb[LWTUNNEL_IP_OPT_VXLAN_GBP]) return -EINVAL; if (info) { struct vxlan_metadata *md = ip_tunnel_info_opts(info) + opts_len; attr = tb[LWTUNNEL_IP_OPT_VXLAN_GBP]; md->gbp = nla_get_u32(attr); md->gbp &= VXLAN_GBP_MASK; __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags); } return sizeof(struct vxlan_metadata); } static int ip_tun_parse_opts_erspan(struct nlattr *attr, struct ip_tunnel_info *info, int opts_len, struct netlink_ext_ack *extack) { struct nlattr *tb[LWTUNNEL_IP_OPT_ERSPAN_MAX + 1]; int err; u8 ver; err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_ERSPAN_MAX, attr, erspan_opt_policy, extack); if (err) return err; if (!tb[LWTUNNEL_IP_OPT_ERSPAN_VER]) return -EINVAL; ver = nla_get_u8(tb[LWTUNNEL_IP_OPT_ERSPAN_VER]); if (ver == 1) { if (!tb[LWTUNNEL_IP_OPT_ERSPAN_INDEX]) return -EINVAL; } else if (ver == 2) { if (!tb[LWTUNNEL_IP_OPT_ERSPAN_DIR] || !tb[LWTUNNEL_IP_OPT_ERSPAN_HWID]) return -EINVAL; } else { return -EINVAL; } if (info) { struct erspan_metadata *md = ip_tunnel_info_opts(info) + opts_len; md->version = ver; if (ver == 1) { attr = tb[LWTUNNEL_IP_OPT_ERSPAN_INDEX]; md->u.index = nla_get_be32(attr); } else { attr = tb[LWTUNNEL_IP_OPT_ERSPAN_DIR]; md->u.md2.dir = nla_get_u8(attr); attr = tb[LWTUNNEL_IP_OPT_ERSPAN_HWID]; set_hwid(&md->u.md2, nla_get_u8(attr)); } __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, info->key.tun_flags); } return sizeof(struct erspan_metadata); } static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info, struct netlink_ext_ack *extack) { int err, rem, opt_len, opts_len = 0; struct nlattr *nla; u32 type = 0; if (!attr) return 0; err = nla_validate(nla_data(attr), nla_len(attr), LWTUNNEL_IP_OPTS_MAX, ip_opts_policy, extack); if (err) return err; nla_for_each_attr(nla, nla_data(attr), nla_len(attr), rem) { switch (nla_type(nla)) { case LWTUNNEL_IP_OPTS_GENEVE: if (type && type != IP_TUNNEL_GENEVE_OPT_BIT) return -EINVAL; opt_len = ip_tun_parse_opts_geneve(nla, info, opts_len, extack); if (opt_len < 0) return opt_len; opts_len += opt_len; if (opts_len > IP_TUNNEL_OPTS_MAX) return -EINVAL; type = IP_TUNNEL_GENEVE_OPT_BIT; break; case LWTUNNEL_IP_OPTS_VXLAN: if (type) return -EINVAL; opt_len = ip_tun_parse_opts_vxlan(nla, info, opts_len, extack); if (opt_len < 0) return opt_len; opts_len += opt_len; type = IP_TUNNEL_VXLAN_OPT_BIT; break; case LWTUNNEL_IP_OPTS_ERSPAN: if (type) return -EINVAL; opt_len = ip_tun_parse_opts_erspan(nla, info, opts_len, extack); if (opt_len < 0) return opt_len; opts_len += opt_len; type = IP_TUNNEL_ERSPAN_OPT_BIT; break; default: return -EINVAL; } } return opts_len; } static int ip_tun_get_optlen(struct nlattr *attr, struct netlink_ext_ack *extack) { return ip_tun_parse_opts(attr, NULL, extack); } static int ip_tun_set_opts(struct nlattr *attr, struct ip_tunnel_info *info, struct netlink_ext_ack *extack) { return ip_tun_parse_opts(attr, info, extack); } static int ip_tun_build_state(struct net *net, struct nlattr *attr, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) { struct nlattr *tb[LWTUNNEL_IP_MAX + 1]; struct lwtunnel_state *new_state; struct ip_tunnel_info *tun_info; int err, opt_len; err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP_MAX, attr, ip_tun_policy, extack); if (err < 0) return err; opt_len = ip_tun_get_optlen(tb[LWTUNNEL_IP_OPTS], extack); if (opt_len < 0) return opt_len; new_state = lwtunnel_state_alloc(sizeof(*tun_info) + opt_len); if (!new_state) return -ENOMEM; new_state->type = LWTUNNEL_ENCAP_IP; tun_info = lwt_tun_info(new_state); err = ip_tun_set_opts(tb[LWTUNNEL_IP_OPTS], tun_info, extack); if (err < 0) { lwtstate_free(new_state); return err; } #ifdef CONFIG_DST_CACHE err = dst_cache_init(&tun_info->dst_cache, GFP_KERNEL); if (err) { lwtstate_free(new_state); return err; } #endif if (tb[LWTUNNEL_IP_ID]) tun_info->key.tun_id = nla_get_be64(tb[LWTUNNEL_IP_ID]); if (tb[LWTUNNEL_IP_DST]) tun_info->key.u.ipv4.dst = nla_get_in_addr(tb[LWTUNNEL_IP_DST]); if (tb[LWTUNNEL_IP_SRC]) tun_info->key.u.ipv4.src = nla_get_in_addr(tb[LWTUNNEL_IP_SRC]); if (tb[LWTUNNEL_IP_TTL]) tun_info->key.ttl = nla_get_u8(tb[LWTUNNEL_IP_TTL]); if (tb[LWTUNNEL_IP_TOS]) tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]); if (tb[LWTUNNEL_IP_FLAGS]) { IP_TUNNEL_DECLARE_FLAGS(flags); ip_tunnel_flags_from_be16(flags, nla_get_be16(tb[LWTUNNEL_IP_FLAGS])); ip_tunnel_clear_options_present(flags); ip_tunnel_flags_or(tun_info->key.tun_flags, tun_info->key.tun_flags, flags); } tun_info->mode = IP_TUNNEL_INFO_TX; tun_info->options_len = opt_len; *ts = new_state; return 0; } static void ip_tun_destroy_state(struct lwtunnel_state *lwtstate) { #ifdef CONFIG_DST_CACHE struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate); dst_cache_destroy(&tun_info->dst_cache); #endif } static int ip_tun_fill_encap_opts_geneve(struct sk_buff *skb, struct ip_tunnel_info *tun_info) { struct geneve_opt *opt; struct nlattr *nest; int offset = 0; nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_GENEVE); if (!nest) return -ENOMEM; while (tun_info->options_len > offset) { opt = ip_tunnel_info_opts(tun_info) + offset; if (nla_put_be16(skb, LWTUNNEL_IP_OPT_GENEVE_CLASS, opt->opt_class) || nla_put_u8(skb, LWTUNNEL_IP_OPT_GENEVE_TYPE, opt->type) || nla_put(skb, LWTUNNEL_IP_OPT_GENEVE_DATA, opt->length * 4, opt->opt_data)) { nla_nest_cancel(skb, nest); return -ENOMEM; } offset += sizeof(*opt) + opt->length * 4; } nla_nest_end(skb, nest); return 0; } static int ip_tun_fill_encap_opts_vxlan(struct sk_buff *skb, struct ip_tunnel_info *tun_info) { struct vxlan_metadata *md; struct nlattr *nest; nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_VXLAN); if (!nest) return -ENOMEM; md = ip_tunnel_info_opts(tun_info); if (nla_put_u32(skb, LWTUNNEL_IP_OPT_VXLAN_GBP, md->gbp)) { nla_nest_cancel(skb, nest); return -ENOMEM; } nla_nest_end(skb, nest); return 0; } static int ip_tun_fill_encap_opts_erspan(struct sk_buff *skb, struct ip_tunnel_info *tun_info) { struct erspan_metadata *md; struct nlattr *nest; nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_ERSPAN); if (!nest) return -ENOMEM; md = ip_tunnel_info_opts(tun_info); if (nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_VER, md->version)) goto err; if (md->version == 1 && nla_put_be32(skb, LWTUNNEL_IP_OPT_ERSPAN_INDEX, md->u.index)) goto err; if (md->version == 2 && (nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_DIR, md->u.md2.dir) || nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_HWID, get_hwid(&md->u.md2)))) goto err; nla_nest_end(skb, nest); return 0; err: nla_nest_cancel(skb, nest); return -ENOMEM; } static int ip_tun_fill_encap_opts(struct sk_buff *skb, int type, struct ip_tunnel_info *tun_info) { struct nlattr *nest; int err = 0; if (!ip_tunnel_is_options_present(tun_info->key.tun_flags)) return 0; nest = nla_nest_start_noflag(skb, type); if (!nest) return -ENOMEM; if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, tun_info->key.tun_flags)) err = ip_tun_fill_encap_opts_geneve(skb, tun_info); else if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, tun_info->key.tun_flags)) err = ip_tun_fill_encap_opts_vxlan(skb, tun_info); else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, tun_info->key.tun_flags)) err = ip_tun_fill_encap_opts_erspan(skb, tun_info); if (err) { nla_nest_cancel(skb, nest); return err; } nla_nest_end(skb, nest); return 0; } static int ip_tun_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwtstate) { struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate); if (nla_put_be64(skb, LWTUNNEL_IP_ID, tun_info->key.tun_id, LWTUNNEL_IP_PAD) || nla_put_in_addr(skb, LWTUNNEL_IP_DST, tun_info->key.u.ipv4.dst) || nla_put_in_addr(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) || nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) || nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) || nla_put_be16(skb, LWTUNNEL_IP_FLAGS, ip_tunnel_flags_to_be16(tun_info->key.tun_flags)) || ip_tun_fill_encap_opts(skb, LWTUNNEL_IP_OPTS, tun_info)) return -ENOMEM; return 0; } static int ip_tun_opts_nlsize(struct ip_tunnel_info *info) { int opt_len; if (!ip_tunnel_is_options_present(info->key.tun_flags)) return 0; opt_len = nla_total_size(0); /* LWTUNNEL_IP_OPTS */ if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, info->key.tun_flags)) { struct geneve_opt *opt; int offset = 0; opt_len += nla_total_size(0); /* LWTUNNEL_IP_OPTS_GENEVE */ while (info->options_len > offset) { opt = ip_tunnel_info_opts(info) + offset; opt_len += nla_total_size(2) /* OPT_GENEVE_CLASS */ + nla_total_size(1) /* OPT_GENEVE_TYPE */ + nla_total_size(opt->length * 4); /* OPT_GENEVE_DATA */ offset += sizeof(*opt) + opt->length * 4; } } else if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags)) { opt_len += nla_total_size(0) /* LWTUNNEL_IP_OPTS_VXLAN */ + nla_total_size(4); /* OPT_VXLAN_GBP */ } else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, info->key.tun_flags)) { struct erspan_metadata *md = ip_tunnel_info_opts(info); opt_len += nla_total_size(0) /* LWTUNNEL_IP_OPTS_ERSPAN */ + nla_total_size(1) /* OPT_ERSPAN_VER */ + (md->version == 1 ? nla_total_size(4) /* OPT_ERSPAN_INDEX (v1) */ : nla_total_size(1) + nla_total_size(1)); /* OPT_ERSPAN_DIR + HWID (v2) */ } return opt_len; } static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate) { return nla_total_size_64bit(8) /* LWTUNNEL_IP_ID */ + nla_total_size(4) /* LWTUNNEL_IP_DST */ + nla_total_size(4) /* LWTUNNEL_IP_SRC */ + nla_total_size(1) /* LWTUNNEL_IP_TOS */ + nla_total_size(1) /* LWTUNNEL_IP_TTL */ + nla_total_size(2) /* LWTUNNEL_IP_FLAGS */ + ip_tun_opts_nlsize(lwt_tun_info(lwtstate)); /* LWTUNNEL_IP_OPTS */ } static int ip_tun_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b) { struct ip_tunnel_info *info_a = lwt_tun_info(a); struct ip_tunnel_info *info_b = lwt_tun_info(b); return memcmp(info_a, info_b, sizeof(info_a->key)) || info_a->mode != info_b->mode || info_a->options_len != info_b->options_len || memcmp(ip_tunnel_info_opts(info_a), ip_tunnel_info_opts(info_b), info_a->options_len); } static const struct lwtunnel_encap_ops ip_tun_lwt_ops = { .build_state = ip_tun_build_state, .destroy_state = ip_tun_destroy_state, .fill_encap = ip_tun_fill_encap_info, .get_encap_size = ip_tun_encap_nlsize, .cmp_encap = ip_tun_cmp_encap, .owner = THIS_MODULE, }; static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = { [LWTUNNEL_IP6_UNSPEC] = { .strict_start_type = LWTUNNEL_IP6_OPTS }, [LWTUNNEL_IP6_ID] = { .type = NLA_U64 }, [LWTUNNEL_IP6_DST] = { .len = sizeof(struct in6_addr) }, [LWTUNNEL_IP6_SRC] = { .len = sizeof(struct in6_addr) }, [LWTUNNEL_IP6_HOPLIMIT] = { .type = NLA_U8 }, [LWTUNNEL_IP6_TC] = { .type = NLA_U8 }, [LWTUNNEL_IP6_FLAGS] = { .type = NLA_U16 }, [LWTUNNEL_IP6_OPTS] = { .type = NLA_NESTED }, }; static int ip6_tun_build_state(struct net *net, struct nlattr *attr, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) { struct nlattr *tb[LWTUNNEL_IP6_MAX + 1]; struct lwtunnel_state *new_state; struct ip_tunnel_info *tun_info; int err, opt_len; err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP6_MAX, attr, ip6_tun_policy, extack); if (err < 0) return err; opt_len = ip_tun_get_optlen(tb[LWTUNNEL_IP6_OPTS], extack); if (opt_len < 0) return opt_len; new_state = lwtunnel_state_alloc(sizeof(*tun_info) + opt_len); if (!new_state) return -ENOMEM; new_state->type = LWTUNNEL_ENCAP_IP6; tun_info = lwt_tun_info(new_state); err = ip_tun_set_opts(tb[LWTUNNEL_IP6_OPTS], tun_info, extack); if (err < 0) { lwtstate_free(new_state); return err; } if (tb[LWTUNNEL_IP6_ID]) tun_info->key.tun_id = nla_get_be64(tb[LWTUNNEL_IP6_ID]); if (tb[LWTUNNEL_IP6_DST]) tun_info->key.u.ipv6.dst = nla_get_in6_addr(tb[LWTUNNEL_IP6_DST]); if (tb[LWTUNNEL_IP6_SRC]) tun_info->key.u.ipv6.src = nla_get_in6_addr(tb[LWTUNNEL_IP6_SRC]); if (tb[LWTUNNEL_IP6_HOPLIMIT]) tun_info->key.ttl = nla_get_u8(tb[LWTUNNEL_IP6_HOPLIMIT]); if (tb[LWTUNNEL_IP6_TC]) tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP6_TC]); if (tb[LWTUNNEL_IP6_FLAGS]) { IP_TUNNEL_DECLARE_FLAGS(flags); __be16 data; data = nla_get_be16(tb[LWTUNNEL_IP6_FLAGS]); ip_tunnel_flags_from_be16(flags, data); ip_tunnel_clear_options_present(flags); ip_tunnel_flags_or(tun_info->key.tun_flags, tun_info->key.tun_flags, flags); } tun_info->mode = IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_IPV6; tun_info->options_len = opt_len; *ts = new_state; return 0; } static int ip6_tun_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwtstate) { struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate); if (nla_put_be64(skb, LWTUNNEL_IP6_ID, tun_info->key.tun_id, LWTUNNEL_IP6_PAD) || nla_put_in6_addr(skb, LWTUNNEL_IP6_DST, &tun_info->key.u.ipv6.dst) || nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) || nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.tos) || nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.ttl) || nla_put_be16(skb, LWTUNNEL_IP6_FLAGS, ip_tunnel_flags_to_be16(tun_info->key.tun_flags)) || ip_tun_fill_encap_opts(skb, LWTUNNEL_IP6_OPTS, tun_info)) return -ENOMEM; return 0; } static int ip6_tun_encap_nlsize(struct lwtunnel_state *lwtstate) { return nla_total_size_64bit(8) /* LWTUNNEL_IP6_ID */ + nla_total_size(16) /* LWTUNNEL_IP6_DST */ + nla_total_size(16) /* LWTUNNEL_IP6_SRC */ + nla_total_size(1) /* LWTUNNEL_IP6_HOPLIMIT */ + nla_total_size(1) /* LWTUNNEL_IP6_TC */ + nla_total_size(2) /* LWTUNNEL_IP6_FLAGS */ + ip_tun_opts_nlsize(lwt_tun_info(lwtstate)); /* LWTUNNEL_IP6_OPTS */ } static const struct lwtunnel_encap_ops ip6_tun_lwt_ops = { .build_state = ip6_tun_build_state, .fill_encap = ip6_tun_fill_encap_info, .get_encap_size = ip6_tun_encap_nlsize, .cmp_encap = ip_tun_cmp_encap, .owner = THIS_MODULE, }; void __init ip_tunnel_core_init(void) { /* If you land here, make sure whether increasing ip_tunnel_info's * options_len is a reasonable choice with its usage in front ends * (f.e., it's part of flow keys, etc). */ BUILD_BUG_ON(IP_TUNNEL_OPTS_MAX != 255); lwtunnel_encap_add_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP); lwtunnel_encap_add_ops(&ip6_tun_lwt_ops, LWTUNNEL_ENCAP_IP6); } DEFINE_STATIC_KEY_FALSE(ip_tunnel_metadata_cnt); EXPORT_SYMBOL(ip_tunnel_metadata_cnt); void ip_tunnel_need_metadata(void) { static_branch_inc(&ip_tunnel_metadata_cnt); } EXPORT_SYMBOL_GPL(ip_tunnel_need_metadata); void ip_tunnel_unneed_metadata(void) { static_branch_dec(&ip_tunnel_metadata_cnt); } EXPORT_SYMBOL_GPL(ip_tunnel_unneed_metadata); /* Returns either the correct skb->protocol value, or 0 if invalid. */ __be16 ip_tunnel_parse_protocol(const struct sk_buff *skb) { if (skb_network_header(skb) >= skb->head && (skb_network_header(skb) + sizeof(struct iphdr)) <= skb_tail_pointer(skb) && ip_hdr(skb)->version == 4) return htons(ETH_P_IP); if (skb_network_header(skb) >= skb->head && (skb_network_header(skb) + sizeof(struct ipv6hdr)) <= skb_tail_pointer(skb) && ipv6_hdr(skb)->version == 6) return htons(ETH_P_IPV6); return 0; } EXPORT_SYMBOL(ip_tunnel_parse_protocol); const struct header_ops ip_tunnel_header_ops = { .parse_protocol = ip_tunnel_parse_protocol }; EXPORT_SYMBOL(ip_tunnel_header_ops); /* This function returns true when ENCAP attributes are present in the nl msg */ bool ip_tunnel_netlink_encap_parms(struct nlattr *data[], struct ip_tunnel_encap *encap) { bool ret = false; memset(encap, 0, sizeof(*encap)); if (!data) return ret; if (data[IFLA_IPTUN_ENCAP_TYPE]) { ret = true; encap->type = nla_get_u16(data[IFLA_IPTUN_ENCAP_TYPE]); } if (data[IFLA_IPTUN_ENCAP_FLAGS]) { ret = true; encap->flags = nla_get_u16(data[IFLA_IPTUN_ENCAP_FLAGS]); } if (data[IFLA_IPTUN_ENCAP_SPORT]) { ret = true; encap->sport = nla_get_be16(data[IFLA_IPTUN_ENCAP_SPORT]); } if (data[IFLA_IPTUN_ENCAP_DPORT]) { ret = true; encap->dport = nla_get_be16(data[IFLA_IPTUN_ENCAP_DPORT]); } return ret; } EXPORT_SYMBOL_GPL(ip_tunnel_netlink_encap_parms); void ip_tunnel_netlink_parms(struct nlattr *data[], struct ip_tunnel_parm_kern *parms) { if (data[IFLA_IPTUN_LINK]) parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]); if (data[IFLA_IPTUN_LOCAL]) parms->iph.saddr = nla_get_be32(data[IFLA_IPTUN_LOCAL]); if (data[IFLA_IPTUN_REMOTE]) parms->iph.daddr = nla_get_be32(data[IFLA_IPTUN_REMOTE]); if (data[IFLA_IPTUN_TTL]) { parms->iph.ttl = nla_get_u8(data[IFLA_IPTUN_TTL]); if (parms->iph.ttl) parms->iph.frag_off = htons(IP_DF); } if (data[IFLA_IPTUN_TOS]) parms->iph.tos = nla_get_u8(data[IFLA_IPTUN_TOS]); if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC])) parms->iph.frag_off = htons(IP_DF); if (data[IFLA_IPTUN_FLAGS]) { __be16 flags; flags = nla_get_be16(data[IFLA_IPTUN_FLAGS]); ip_tunnel_flags_from_be16(parms->i_flags, flags); } if (data[IFLA_IPTUN_PROTO]) parms->iph.protocol = nla_get_u8(data[IFLA_IPTUN_PROTO]); } EXPORT_SYMBOL_GPL(ip_tunnel_netlink_parms);
6 5 1 10 10 4 6 4 2 2 2 2 1 1 2 59 15 54 50 1 8 67 7 40 32 72 4 1 114 6 5 4 120 119 107 143 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NET_NF_TABLES_H #define _NET_NF_TABLES_H #include <asm/unaligned.h> #include <linux/list.h> #include <linux/netfilter.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/nf_tables.h> #include <linux/u64_stats_sync.h> #include <linux/rhashtable.h> #include <net/netfilter/nf_flow_table.h> #include <net/netlink.h> #include <net/flow_offload.h> #include <net/netns/generic.h> #define NFT_MAX_HOOKS (NF_INET_INGRESS + 1) struct module; #define NFT_JUMP_STACK_SIZE 16 enum { NFT_PKTINFO_L4PROTO = (1 << 0), NFT_PKTINFO_INNER = (1 << 1), NFT_PKTINFO_INNER_FULL = (1 << 2), }; struct nft_pktinfo { struct sk_buff *skb; const struct nf_hook_state *state; u8 flags; u8 tprot; u16 fragoff; u16 thoff; u16 inneroff; }; static inline struct sock *nft_sk(const struct nft_pktinfo *pkt) { return pkt->state->sk; } static inline unsigned int nft_thoff(const struct nft_pktinfo *pkt) { return pkt->thoff; } static inline struct net *nft_net(const struct nft_pktinfo *pkt) { return pkt->state->net; } static inline unsigned int nft_hook(const struct nft_pktinfo *pkt) { return pkt->state->hook; } static inline u8 nft_pf(const struct nft_pktinfo *pkt) { return pkt->state->pf; } static inline const struct net_device *nft_in(const struct nft_pktinfo *pkt) { return pkt->state->in; } static inline const struct net_device *nft_out(const struct nft_pktinfo *pkt) { return pkt->state->out; } static inline void nft_set_pktinfo(struct nft_pktinfo *pkt, struct sk_buff *skb, const struct nf_hook_state *state) { pkt->skb = skb; pkt->state = state; } static inline void nft_set_pktinfo_unspec(struct nft_pktinfo *pkt) { pkt->flags = 0; pkt->tprot = 0; pkt->thoff = 0; pkt->fragoff = 0; } /** * struct nft_verdict - nf_tables verdict * * @code: nf_tables/netfilter verdict code * @chain: destination chain for NFT_JUMP/NFT_GOTO */ struct nft_verdict { u32 code; struct nft_chain *chain; }; struct nft_data { union { u32 data[4]; struct nft_verdict verdict; }; } __attribute__((aligned(__alignof__(u64)))); #define NFT_REG32_NUM 20 /** * struct nft_regs - nf_tables register set * * @data: data registers * @verdict: verdict register * * The first four data registers alias to the verdict register. */ struct nft_regs { union { u32 data[NFT_REG32_NUM]; struct nft_verdict verdict; }; }; struct nft_regs_track { struct { const struct nft_expr *selector; const struct nft_expr *bitwise; u8 num_reg; } regs[NFT_REG32_NUM]; const struct nft_expr *cur; const struct nft_expr *last; }; /* Store/load an u8, u16 or u64 integer to/from the u32 data register. * * Note, when using concatenations, register allocation happens at 32-bit * level. So for store instruction, pad the rest part with zero to avoid * garbage values. */ static inline void nft_reg_store8(u32 *dreg, u8 val) { *dreg = 0; *(u8 *)dreg = val; } static inline u8 nft_reg_load8(const u32 *sreg) { return *(u8 *)sreg; } static inline void nft_reg_store16(u32 *dreg, u16 val) { *dreg = 0; *(u16 *)dreg = val; } static inline void nft_reg_store_be16(u32 *dreg, __be16 val) { nft_reg_store16(dreg, (__force __u16)val); } static inline u16 nft_reg_load16(const u32 *sreg) { return *(u16 *)sreg; } static inline __be16 nft_reg_load_be16(const u32 *sreg) { return (__force __be16)nft_reg_load16(sreg); } static inline __be32 nft_reg_load_be32(const u32 *sreg) { return *(__force __be32 *)sreg; } static inline void nft_reg_store64(u64 *dreg, u64 val) { put_unaligned(val, dreg); } static inline u64 nft_reg_load64(const u32 *sreg) { return get_unaligned((u64 *)sreg); } static inline void nft_data_copy(u32 *dst, const struct nft_data *src, unsigned int len) { if (len % NFT_REG32_SIZE) dst[len / NFT_REG32_SIZE] = 0; memcpy(dst, src, len); } /** * struct nft_ctx - nf_tables rule/set context * * @net: net namespace * @table: the table the chain is contained in * @chain: the chain the rule is contained in * @nla: netlink attributes * @portid: netlink portID of the original message * @seq: netlink sequence number * @flags: modifiers to new request * @family: protocol family * @level: depth of the chains * @report: notify via unicast netlink message */ struct nft_ctx { struct net *net; struct nft_table *table; struct nft_chain *chain; const struct nlattr * const *nla; u32 portid; u32 seq; u16 flags; u8 family; u8 level; bool report; }; enum nft_data_desc_flags { NFT_DATA_DESC_SETELEM = (1 << 0), }; struct nft_data_desc { enum nft_data_types type; unsigned int size; unsigned int len; unsigned int flags; }; int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data, struct nft_data_desc *desc, const struct nlattr *nla); void nft_data_hold(const struct nft_data *data, enum nft_data_types type); void nft_data_release(const struct nft_data *data, enum nft_data_types type); int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data, enum nft_data_types type, unsigned int len); static inline enum nft_data_types nft_dreg_to_type(enum nft_registers reg) { return reg == NFT_REG_VERDICT ? NFT_DATA_VERDICT : NFT_DATA_VALUE; } static inline enum nft_registers nft_type_to_reg(enum nft_data_types type) { return type == NFT_DATA_VERDICT ? NFT_REG_VERDICT : NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE; } int nft_parse_u32_check(const struct nlattr *attr, int max, u32 *dest); int nft_dump_register(struct sk_buff *skb, unsigned int attr, unsigned int reg); int nft_parse_register_load(const struct nlattr *attr, u8 *sreg, u32 len); int nft_parse_register_store(const struct nft_ctx *ctx, const struct nlattr *attr, u8 *dreg, const struct nft_data *data, enum nft_data_types type, unsigned int len); /** * struct nft_userdata - user defined data associated with an object * * @len: length of the data * @data: content * * The presence of user data is indicated in an object specific fashion, * so a length of zero can't occur and the value "len" indicates data * of length len + 1. */ struct nft_userdata { u8 len; unsigned char data[]; }; /* placeholder structure for opaque set element backend representation. */ struct nft_elem_priv { }; /** * struct nft_set_elem - generic representation of set elements * * @key: element key * @key_end: closing element key * @data: element data * @priv: element private data and extensions */ struct nft_set_elem { union { u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)]; struct nft_data val; } key; union { u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)]; struct nft_data val; } key_end; union { u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)]; struct nft_data val; } data; struct nft_elem_priv *priv; }; static inline void *nft_elem_priv_cast(const struct nft_elem_priv *priv) { return (void *)priv; } /** * enum nft_iter_type - nftables set iterator type * * @NFT_ITER_READ: read-only iteration over set elements * @NFT_ITER_UPDATE: iteration under mutex to update set element state */ enum nft_iter_type { NFT_ITER_UNSPEC, NFT_ITER_READ, NFT_ITER_UPDATE, }; struct nft_set; struct nft_set_iter { u8 genmask; enum nft_iter_type type:8; unsigned int count; unsigned int skip; int err; int (*fn)(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv); }; /** * struct nft_set_desc - description of set elements * * @ktype: key type * @klen: key length * @dtype: data type * @dlen: data length * @objtype: object type * @size: number of set elements * @policy: set policy * @gc_int: garbage collector interval * @timeout: element timeout * @field_len: length of each field in concatenation, bytes * @field_count: number of concatenated fields in element * @expr: set must support for expressions */ struct nft_set_desc { u32 ktype; unsigned int klen; u32 dtype; unsigned int dlen; u32 objtype; unsigned int size; u32 policy; u32 gc_int; u64 timeout; u8 field_len[NFT_REG32_COUNT]; u8 field_count; bool expr; }; /** * enum nft_set_class - performance class * * @NFT_SET_CLASS_O_1: constant, O(1) * @NFT_SET_CLASS_O_LOG_N: logarithmic, O(log N) * @NFT_SET_CLASS_O_N: linear, O(N) */ enum nft_set_class { NFT_SET_CLASS_O_1, NFT_SET_CLASS_O_LOG_N, NFT_SET_CLASS_O_N, }; /** * struct nft_set_estimate - estimation of memory and performance * characteristics * * @size: required memory * @lookup: lookup performance class * @space: memory class */ struct nft_set_estimate { u64 size; enum nft_set_class lookup; enum nft_set_class space; }; #define NFT_EXPR_MAXATTR 16 #define NFT_EXPR_SIZE(size) (sizeof(struct nft_expr) + \ ALIGN(size, __alignof__(struct nft_expr))) /** * struct nft_expr - nf_tables expression * * @ops: expression ops * @data: expression private data */ struct nft_expr { const struct nft_expr_ops *ops; unsigned char data[] __attribute__((aligned(__alignof__(u64)))); }; static inline void *nft_expr_priv(const struct nft_expr *expr) { return (void *)expr->data; } struct nft_expr_info; int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla, struct nft_expr_info *info); int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src, gfp_t gfp); void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr); int nft_expr_dump(struct sk_buff *skb, unsigned int attr, const struct nft_expr *expr, bool reset); bool nft_expr_reduce_bitwise(struct nft_regs_track *track, const struct nft_expr *expr); struct nft_set_ext; /** * struct nft_set_ops - nf_tables set operations * * @lookup: look up an element within the set * @update: update an element if exists, add it if doesn't exist * @delete: delete an element * @insert: insert new element into set * @activate: activate new element in the next generation * @deactivate: lookup for element and deactivate it in the next generation * @flush: deactivate element in the next generation * @remove: remove element from set * @walk: iterate over all set elements * @get: get set elements * @commit: commit set elements * @abort: abort set elements * @privsize: function to return size of set private data * @estimate: estimate the required memory size and the lookup complexity class * @init: initialize private data of new set instance * @destroy: destroy private data of set instance * @gc_init: initialize garbage collection * @elemsize: element private size * * Operations lookup, update and delete have simpler interfaces, are faster * and currently only used in the packet path. All the rest are slower, * control plane functions. */ struct nft_set_ops { bool (*lookup)(const struct net *net, const struct nft_set *set, const u32 *key, const struct nft_set_ext **ext); bool (*update)(struct nft_set *set, const u32 *key, struct nft_elem_priv * (*new)(struct nft_set *, const struct nft_expr *, struct nft_regs *), const struct nft_expr *expr, struct nft_regs *regs, const struct nft_set_ext **ext); bool (*delete)(const struct nft_set *set, const u32 *key); int (*insert)(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, struct nft_elem_priv **priv); void (*activate)(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv); struct nft_elem_priv * (*deactivate)(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem); void (*flush)(const struct net *net, const struct nft_set *set, struct nft_elem_priv *priv); void (*remove)(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv); void (*walk)(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_iter *iter); struct nft_elem_priv * (*get)(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, unsigned int flags); void (*commit)(struct nft_set *set); void (*abort)(const struct nft_set *set); u64 (*privsize)(const struct nlattr * const nla[], const struct nft_set_desc *desc); bool (*estimate)(const struct nft_set_desc *desc, u32 features, struct nft_set_estimate *est); int (*init)(const struct nft_set *set, const struct nft_set_desc *desc, const struct nlattr * const nla[]); void (*destroy)(const struct nft_ctx *ctx, const struct nft_set *set); void (*gc_init)(const struct nft_set *set); unsigned int elemsize; }; /** * struct nft_set_type - nf_tables set type * * @ops: set ops for this type * @features: features supported by the implementation */ struct nft_set_type { const struct nft_set_ops ops; u32 features; }; #define to_set_type(o) container_of(o, struct nft_set_type, ops) struct nft_set_elem_expr { u8 size; unsigned char data[] __attribute__((aligned(__alignof__(struct nft_expr)))); }; #define nft_setelem_expr_at(__elem_expr, __offset) \ ((struct nft_expr *)&__elem_expr->data[__offset]) #define nft_setelem_expr_foreach(__expr, __elem_expr, __size) \ for (__expr = nft_setelem_expr_at(__elem_expr, 0), __size = 0; \ __size < (__elem_expr)->size; \ __size += (__expr)->ops->size, __expr = ((void *)(__expr)) + (__expr)->ops->size) #define NFT_SET_EXPR_MAX 2 /** * struct nft_set - nf_tables set instance * * @list: table set list node * @bindings: list of set bindings * @refs: internal refcounting for async set destruction * @table: table this set belongs to * @net: netnamespace this set belongs to * @name: name of the set * @handle: unique handle of the set * @ktype: key type (numeric type defined by userspace, not used in the kernel) * @dtype: data type (verdict or numeric type defined by userspace) * @objtype: object type (see NFT_OBJECT_* definitions) * @size: maximum set size * @field_len: length of each field in concatenation, bytes * @field_count: number of concatenated fields in element * @use: number of rules references to this set * @nelems: number of elements * @ndeact: number of deactivated elements queued for removal * @timeout: default timeout value in jiffies * @gc_int: garbage collection interval in msecs * @policy: set parameterization (see enum nft_set_policies) * @udlen: user data length * @udata: user data * @pending_update: list of pending update set element * @ops: set ops * @flags: set flags * @dead: set will be freed, never cleared * @genmask: generation mask * @klen: key length * @dlen: data length * @num_exprs: numbers of exprs * @exprs: stateful expression * @catchall_list: list of catch-all set element * @data: private set data */ struct nft_set { struct list_head list; struct list_head bindings; refcount_t refs; struct nft_table *table; possible_net_t net; char *name; u64 handle; u32 ktype; u32 dtype; u32 objtype; u32 size; u8 field_len[NFT_REG32_COUNT]; u8 field_count; u32 use; atomic_t nelems; u32 ndeact; u64 timeout; u32 gc_int; u16 policy; u16 udlen; unsigned char *udata; struct list_head pending_update; /* runtime data below here */ const struct nft_set_ops *ops ____cacheline_aligned; u16 flags:13, dead:1, genmask:2; u8 klen; u8 dlen; u8 num_exprs; struct nft_expr *exprs[NFT_SET_EXPR_MAX]; struct list_head catchall_list; unsigned char data[] __attribute__((aligned(__alignof__(u64)))); }; static inline bool nft_set_is_anonymous(const struct nft_set *set) { return set->flags & NFT_SET_ANONYMOUS; } static inline void *nft_set_priv(const struct nft_set *set) { return (void *)set->data; } static inline enum nft_data_types nft_set_datatype(const struct nft_set *set) { return set->dtype == NFT_DATA_VERDICT ? NFT_DATA_VERDICT : NFT_DATA_VALUE; } static inline bool nft_set_gc_is_pending(const struct nft_set *s) { return refcount_read(&s->refs) != 1; } static inline struct nft_set *nft_set_container_of(const void *priv) { return (void *)priv - offsetof(struct nft_set, data); } struct nft_set *nft_set_lookup_global(const struct net *net, const struct nft_table *table, const struct nlattr *nla_set_name, const struct nlattr *nla_set_id, u8 genmask); struct nft_set_ext *nft_set_catchall_lookup(const struct net *net, const struct nft_set *set); static inline unsigned long nft_set_gc_interval(const struct nft_set *set) { u32 gc_int = READ_ONCE(set->gc_int); return gc_int ? msecs_to_jiffies(gc_int) : HZ; } /** * struct nft_set_binding - nf_tables set binding * * @list: set bindings list node * @chain: chain containing the rule bound to the set * @flags: set action flags * * A set binding contains all information necessary for validation * of new elements added to a bound set. */ struct nft_set_binding { struct list_head list; const struct nft_chain *chain; u32 flags; }; enum nft_trans_phase; void nf_tables_activate_set(const struct nft_ctx *ctx, struct nft_set *set); void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding, enum nft_trans_phase phase); int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding); void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set); /** * enum nft_set_extensions - set extension type IDs * * @NFT_SET_EXT_KEY: element key * @NFT_SET_EXT_KEY_END: upper bound element key, for ranges * @NFT_SET_EXT_DATA: mapping data * @NFT_SET_EXT_FLAGS: element flags * @NFT_SET_EXT_TIMEOUT: element timeout * @NFT_SET_EXT_EXPIRATION: element expiration time * @NFT_SET_EXT_USERDATA: user data associated with the element * @NFT_SET_EXT_EXPRESSIONS: expressions assiciated with the element * @NFT_SET_EXT_OBJREF: stateful object reference associated with element * @NFT_SET_EXT_NUM: number of extension types */ enum nft_set_extensions { NFT_SET_EXT_KEY, NFT_SET_EXT_KEY_END, NFT_SET_EXT_DATA, NFT_SET_EXT_FLAGS, NFT_SET_EXT_TIMEOUT, NFT_SET_EXT_EXPIRATION, NFT_SET_EXT_USERDATA, NFT_SET_EXT_EXPRESSIONS, NFT_SET_EXT_OBJREF, NFT_SET_EXT_NUM }; /** * struct nft_set_ext_type - set extension type * * @len: fixed part length of the extension * @align: alignment requirements of the extension */ struct nft_set_ext_type { u8 len; u8 align; }; extern const struct nft_set_ext_type nft_set_ext_types[]; /** * struct nft_set_ext_tmpl - set extension template * * @len: length of extension area * @offset: offsets of individual extension types * @ext_len: length of the expected extension(used to sanity check) */ struct nft_set_ext_tmpl { u16 len; u8 offset[NFT_SET_EXT_NUM]; u8 ext_len[NFT_SET_EXT_NUM]; }; /** * struct nft_set_ext - set extensions * * @genmask: generation mask * @offset: offsets of individual extension types * @data: beginning of extension data */ struct nft_set_ext { u8 genmask; u8 offset[NFT_SET_EXT_NUM]; char data[]; }; static inline void nft_set_ext_prepare(struct nft_set_ext_tmpl *tmpl) { memset(tmpl, 0, sizeof(*tmpl)); tmpl->len = sizeof(struct nft_set_ext); } static inline int nft_set_ext_add_length(struct nft_set_ext_tmpl *tmpl, u8 id, unsigned int len) { tmpl->len = ALIGN(tmpl->len, nft_set_ext_types[id].align); if (tmpl->len > U8_MAX) return -EINVAL; tmpl->offset[id] = tmpl->len; tmpl->ext_len[id] = nft_set_ext_types[id].len + len; tmpl->len += tmpl->ext_len[id]; return 0; } static inline int nft_set_ext_add(struct nft_set_ext_tmpl *tmpl, u8 id) { return nft_set_ext_add_length(tmpl, id, 0); } static inline void nft_set_ext_init(struct nft_set_ext *ext, const struct nft_set_ext_tmpl *tmpl) { memcpy(ext->offset, tmpl->offset, sizeof(ext->offset)); } static inline bool __nft_set_ext_exists(const struct nft_set_ext *ext, u8 id) { return !!ext->offset[id]; } static inline bool nft_set_ext_exists(const struct nft_set_ext *ext, u8 id) { return ext && __nft_set_ext_exists(ext, id); } static inline void *nft_set_ext(const struct nft_set_ext *ext, u8 id) { return (void *)ext + ext->offset[id]; } static inline struct nft_data *nft_set_ext_key(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_KEY); } static inline struct nft_data *nft_set_ext_key_end(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_KEY_END); } static inline struct nft_data *nft_set_ext_data(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_DATA); } static inline u8 *nft_set_ext_flags(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_FLAGS); } static inline u64 *nft_set_ext_timeout(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_TIMEOUT); } static inline u64 *nft_set_ext_expiration(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_EXPIRATION); } static inline struct nft_userdata *nft_set_ext_userdata(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_USERDATA); } static inline struct nft_set_elem_expr *nft_set_ext_expr(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_EXPRESSIONS); } static inline bool __nft_set_elem_expired(const struct nft_set_ext *ext, u64 tstamp) { return nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION) && time_after_eq64(tstamp, *nft_set_ext_expiration(ext)); } static inline bool nft_set_elem_expired(const struct nft_set_ext *ext) { return __nft_set_elem_expired(ext, get_jiffies_64()); } static inline struct nft_set_ext *nft_set_elem_ext(const struct nft_set *set, const struct nft_elem_priv *elem_priv) { return (void *)elem_priv + set->ops->elemsize; } static inline struct nft_object **nft_set_ext_obj(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_OBJREF); } struct nft_expr *nft_set_elem_expr_alloc(const struct nft_ctx *ctx, const struct nft_set *set, const struct nlattr *attr); struct nft_elem_priv *nft_set_elem_init(const struct nft_set *set, const struct nft_set_ext_tmpl *tmpl, const u32 *key, const u32 *key_end, const u32 *data, u64 timeout, u64 expiration, gfp_t gfp); int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set, struct nft_expr *expr_array[]); void nft_set_elem_destroy(const struct nft_set *set, const struct nft_elem_priv *elem_priv, bool destroy_expr); void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, const struct nft_set *set, const struct nft_elem_priv *elem_priv); struct nft_expr_ops; /** * struct nft_expr_type - nf_tables expression type * * @select_ops: function to select nft_expr_ops * @release_ops: release nft_expr_ops * @ops: default ops, used when no select_ops functions is present * @inner_ops: inner ops, used for inner packet operation * @list: used internally * @name: Identifier * @owner: module reference * @policy: netlink attribute policy * @maxattr: highest netlink attribute number * @family: address family for AF-specific types * @flags: expression type flags */ struct nft_expr_type { const struct nft_expr_ops *(*select_ops)(const struct nft_ctx *, const struct nlattr * const tb[]); void (*release_ops)(const struct nft_expr_ops *ops); const struct nft_expr_ops *ops; const struct nft_expr_ops *inner_ops; struct list_head list; const char *name; struct module *owner; const struct nla_policy *policy; unsigned int maxattr; u8 family; u8 flags; }; #define NFT_EXPR_STATEFUL 0x1 #define NFT_EXPR_GC 0x2 enum nft_trans_phase { NFT_TRANS_PREPARE, NFT_TRANS_PREPARE_ERROR, NFT_TRANS_ABORT, NFT_TRANS_COMMIT, NFT_TRANS_RELEASE }; struct nft_flow_rule; struct nft_offload_ctx; /** * struct nft_expr_ops - nf_tables expression operations * * @eval: Expression evaluation function * @clone: Expression clone function * @size: full expression size, including private data size * @init: initialization function * @activate: activate expression in the next generation * @deactivate: deactivate expression in next generation * @destroy: destruction function, called after synchronize_rcu * @destroy_clone: destruction clone function * @dump: function to dump parameters * @validate: validate expression, called during loop detection * @reduce: reduce expression * @gc: garbage collection expression * @offload: hardware offload expression * @offload_action: function to report true/false to allocate one slot or not in the flow * offload array * @offload_stats: function to synchronize hardware stats via updating the counter expression * @type: expression type * @data: extra data to attach to this expression operation */ struct nft_expr_ops { void (*eval)(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt); int (*clone)(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp); unsigned int size; int (*init)(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]); void (*activate)(const struct nft_ctx *ctx, const struct nft_expr *expr); void (*deactivate)(const struct nft_ctx *ctx, const struct nft_expr *expr, enum nft_trans_phase phase); void (*destroy)(const struct nft_ctx *ctx, const struct nft_expr *expr); void (*destroy_clone)(const struct nft_ctx *ctx, const struct nft_expr *expr); int (*dump)(struct sk_buff *skb, const struct nft_expr *expr, bool reset); int (*validate)(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data); bool (*reduce)(struct nft_regs_track *track, const struct nft_expr *expr); bool (*gc)(struct net *net, const struct nft_expr *expr); int (*offload)(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow, const struct nft_expr *expr); bool (*offload_action)(const struct nft_expr *expr); void (*offload_stats)(struct nft_expr *expr, const struct flow_stats *stats); const struct nft_expr_type *type; void *data; }; /** * struct nft_rule - nf_tables rule * * @list: used internally * @handle: rule handle * @genmask: generation mask * @dlen: length of expression data * @udata: user data is appended to the rule * @data: expression data */ struct nft_rule { struct list_head list; u64 handle:42, genmask:2, dlen:12, udata:1; unsigned char data[] __attribute__((aligned(__alignof__(struct nft_expr)))); }; static inline struct nft_expr *nft_expr_first(const struct nft_rule *rule) { return (struct nft_expr *)&rule->data[0]; } static inline struct nft_expr *nft_expr_next(const struct nft_expr *expr) { return ((void *)expr) + expr->ops->size; } static inline struct nft_expr *nft_expr_last(const struct nft_rule *rule) { return (struct nft_expr *)&rule->data[rule->dlen]; } static inline bool nft_expr_more(const struct nft_rule *rule, const struct nft_expr *expr) { return expr != nft_expr_last(rule) && expr->ops; } static inline struct nft_userdata *nft_userdata(const struct nft_rule *rule) { return (void *)&rule->data[rule->dlen]; } void nft_rule_expr_activate(const struct nft_ctx *ctx, struct nft_rule *rule); void nft_rule_expr_deactivate(const struct nft_ctx *ctx, struct nft_rule *rule, enum nft_trans_phase phase); void nf_tables_rule_destroy(const struct nft_ctx *ctx, struct nft_rule *rule); static inline void nft_set_elem_update_expr(const struct nft_set_ext *ext, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_set_elem_expr *elem_expr; struct nft_expr *expr; u32 size; if (__nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS)) { elem_expr = nft_set_ext_expr(ext); nft_setelem_expr_foreach(expr, elem_expr, size) { expr->ops->eval(expr, regs, pkt); if (regs->verdict.code == NFT_BREAK) return; } } } /* * The last pointer isn't really necessary, but the compiler isn't able to * determine that the result of nft_expr_last() is always the same since it * can't assume that the dlen value wasn't changed within calls in the loop. */ #define nft_rule_for_each_expr(expr, last, rule) \ for ((expr) = nft_expr_first(rule), (last) = nft_expr_last(rule); \ (expr) != (last); \ (expr) = nft_expr_next(expr)) #define NFT_CHAIN_POLICY_UNSET U8_MAX struct nft_rule_dp { u64 is_last:1, dlen:12, handle:42; /* for tracing */ unsigned char data[] __attribute__((aligned(__alignof__(struct nft_expr)))); }; struct nft_rule_dp_last { struct nft_rule_dp end; /* end of nft_rule_blob marker */ struct rcu_head h; /* call_rcu head */ struct nft_rule_blob *blob; /* ptr to free via call_rcu */ const struct nft_chain *chain; /* for nftables tracing */ }; static inline const struct nft_rule_dp *nft_rule_next(const struct nft_rule_dp *rule) { return (void *)rule + sizeof(*rule) + rule->dlen; } struct nft_rule_blob { unsigned long size; unsigned char data[] __attribute__((aligned(__alignof__(struct nft_rule_dp)))); }; /** * struct nft_chain - nf_tables chain * * @blob_gen_0: rule blob pointer to the current generation * @blob_gen_1: rule blob pointer to the future generation * @rules: list of rules in the chain * @list: used internally * @rhlhead: used internally * @table: table that this chain belongs to * @handle: chain handle * @use: number of jump references to this chain * @flags: bitmask of enum NFTA_CHAIN_FLAGS * @bound: bind or not * @genmask: generation mask * @name: name of the chain * @udlen: user data length * @udata: user data in the chain * @blob_next: rule blob pointer to the next in the chain */ struct nft_chain { struct nft_rule_blob __rcu *blob_gen_0; struct nft_rule_blob __rcu *blob_gen_1; struct list_head rules; struct list_head list; struct rhlist_head rhlhead; struct nft_table *table; u64 handle; u32 use; u8 flags:5, bound:1, genmask:2; char *name; u16 udlen; u8 *udata; /* Only used during control plane commit phase: */ struct nft_rule_blob *blob_next; }; int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain); int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv); int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set); int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain); void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain); enum nft_chain_types { NFT_CHAIN_T_DEFAULT = 0, NFT_CHAIN_T_ROUTE, NFT_CHAIN_T_NAT, NFT_CHAIN_T_MAX }; /** * struct nft_chain_type - nf_tables chain type info * * @name: name of the type * @type: numeric identifier * @family: address family * @owner: module owner * @hook_mask: mask of valid hooks * @hooks: array of hook functions * @ops_register: base chain register function * @ops_unregister: base chain unregister function */ struct nft_chain_type { const char *name; enum nft_chain_types type; int family; struct module *owner; unsigned int hook_mask; nf_hookfn *hooks[NFT_MAX_HOOKS]; int (*ops_register)(struct net *net, const struct nf_hook_ops *ops); void (*ops_unregister)(struct net *net, const struct nf_hook_ops *ops); }; int nft_chain_validate_dependency(const struct nft_chain *chain, enum nft_chain_types type); int nft_chain_validate_hooks(const struct nft_chain *chain, unsigned int hook_flags); static inline bool nft_chain_binding(const struct nft_chain *chain) { return chain->flags & NFT_CHAIN_BINDING; } static inline bool nft_chain_is_bound(struct nft_chain *chain) { return (chain->flags & NFT_CHAIN_BINDING) && chain->bound; } int nft_chain_add(struct nft_table *table, struct nft_chain *chain); void nft_chain_del(struct nft_chain *chain); void nf_tables_chain_destroy(struct nft_ctx *ctx); struct nft_stats { u64 bytes; u64 pkts; struct u64_stats_sync syncp; }; struct nft_hook { struct list_head list; struct nf_hook_ops ops; struct rcu_head rcu; }; /** * struct nft_base_chain - nf_tables base chain * * @ops: netfilter hook ops * @hook_list: list of netfilter hooks (for NFPROTO_NETDEV family) * @type: chain type * @policy: default policy * @flags: indicate the base chain disabled or not * @stats: per-cpu chain stats * @chain: the chain * @flow_block: flow block (for hardware offload) */ struct nft_base_chain { struct nf_hook_ops ops; struct list_head hook_list; const struct nft_chain_type *type; u8 policy; u8 flags; struct nft_stats __percpu *stats; struct nft_chain chain; struct flow_block flow_block; }; static inline struct nft_base_chain *nft_base_chain(const struct nft_chain *chain) { return container_of(chain, struct nft_base_chain, chain); } static inline bool nft_is_base_chain(const struct nft_chain *chain) { return chain->flags & NFT_CHAIN_BASE; } int __nft_release_basechain(struct nft_ctx *ctx); unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv); static inline bool nft_use_inc(u32 *use) { if (*use == UINT_MAX) return false; (*use)++; return true; } static inline void nft_use_dec(u32 *use) { WARN_ON_ONCE((*use)-- == 0); } /* For error and abort path: restore use counter to previous state. */ static inline void nft_use_inc_restore(u32 *use) { WARN_ON_ONCE(!nft_use_inc(use)); } #define nft_use_dec_restore nft_use_dec /** * struct nft_table - nf_tables table * * @list: used internally * @chains_ht: chains in the table * @chains: same, for stable walks * @sets: sets in the table * @objects: stateful objects in the table * @flowtables: flow tables in the table * @hgenerator: handle generator state * @handle: table handle * @use: number of chain references to this table * @family:address family * @flags: table flag (see enum nft_table_flags) * @genmask: generation mask * @nlpid: netlink port ID * @name: name of the table * @udlen: length of the user data * @udata: user data * @validate_state: internal, set when transaction adds jumps */ struct nft_table { struct list_head list; struct rhltable chains_ht; struct list_head chains; struct list_head sets; struct list_head objects; struct list_head flowtables; u64 hgenerator; u64 handle; u32 use; u16 family:6, flags:8, genmask:2; u32 nlpid; char *name; u16 udlen; u8 *udata; u8 validate_state; }; static inline bool nft_table_has_owner(const struct nft_table *table) { return table->flags & NFT_TABLE_F_OWNER; } static inline bool nft_table_is_orphan(const struct nft_table *table) { return (table->flags & (NFT_TABLE_F_OWNER | NFT_TABLE_F_PERSIST)) == NFT_TABLE_F_PERSIST; } static inline bool nft_base_chain_netdev(int family, u32 hooknum) { return family == NFPROTO_NETDEV || (family == NFPROTO_INET && hooknum == NF_INET_INGRESS); } void nft_register_chain_type(const struct nft_chain_type *); void nft_unregister_chain_type(const struct nft_chain_type *); int nft_register_expr(struct nft_expr_type *); void nft_unregister_expr(struct nft_expr_type *); int nft_verdict_dump(struct sk_buff *skb, int type, const struct nft_verdict *v); /** * struct nft_object_hash_key - key to lookup nft_object * * @name: name of the stateful object to look up * @table: table the object belongs to */ struct nft_object_hash_key { const char *name; const struct nft_table *table; }; /** * struct nft_object - nf_tables stateful object * * @list: table stateful object list node * @rhlhead: nft_objname_ht node * @key: keys that identify this object * @genmask: generation mask * @use: number of references to this stateful object * @handle: unique object handle * @udlen: length of user data * @udata: user data * @ops: object operations * @data: object data, layout depends on type */ struct nft_object { struct list_head list; struct rhlist_head rhlhead; struct nft_object_hash_key key; u32 genmask:2; u32 use; u64 handle; u16 udlen; u8 *udata; /* runtime data below here */ const struct nft_object_ops *ops ____cacheline_aligned; unsigned char data[] __attribute__((aligned(__alignof__(u64)))); }; static inline void *nft_obj_data(const struct nft_object *obj) { return (void *)obj->data; } #define nft_expr_obj(expr) *((struct nft_object **)nft_expr_priv(expr)) struct nft_object *nft_obj_lookup(const struct net *net, const struct nft_table *table, const struct nlattr *nla, u32 objtype, u8 genmask); void nft_obj_notify(struct net *net, const struct nft_table *table, struct nft_object *obj, u32 portid, u32 seq, int event, u16 flags, int family, int report, gfp_t gfp); /** * struct nft_object_type - stateful object type * * @select_ops: function to select nft_object_ops * @ops: default ops, used when no select_ops functions is present * @list: list node in list of object types * @type: stateful object numeric type * @owner: module owner * @maxattr: maximum netlink attribute * @family: address family for AF-specific object types * @policy: netlink attribute policy */ struct nft_object_type { const struct nft_object_ops *(*select_ops)(const struct nft_ctx *, const struct nlattr * const tb[]); const struct nft_object_ops *ops; struct list_head list; u32 type; unsigned int maxattr; u8 family; struct module *owner; const struct nla_policy *policy; }; /** * struct nft_object_ops - stateful object operations * * @eval: stateful object evaluation function * @size: stateful object size * @init: initialize object from netlink attributes * @destroy: release existing stateful object * @dump: netlink dump stateful object * @update: update stateful object * @type: pointer to object type */ struct nft_object_ops { void (*eval)(struct nft_object *obj, struct nft_regs *regs, const struct nft_pktinfo *pkt); unsigned int size; int (*init)(const struct nft_ctx *ctx, const struct nlattr *const tb[], struct nft_object *obj); void (*destroy)(const struct nft_ctx *ctx, struct nft_object *obj); int (*dump)(struct sk_buff *skb, struct nft_object *obj, bool reset); void (*update)(struct nft_object *obj, struct nft_object *newobj); const struct nft_object_type *type; }; int nft_register_obj(struct nft_object_type *obj_type); void nft_unregister_obj(struct nft_object_type *obj_type); #define NFT_NETDEVICE_MAX 256 /** * struct nft_flowtable - nf_tables flow table * * @list: flow table list node in table list * @table: the table the flow table is contained in * @name: name of this flow table * @hooknum: hook number * @ops_len: number of hooks in array * @genmask: generation mask * @use: number of references to this flow table * @handle: unique object handle * @hook_list: hook list for hooks per net_device in flowtables * @data: rhashtable and garbage collector */ struct nft_flowtable { struct list_head list; struct nft_table *table; char *name; int hooknum; int ops_len; u32 genmask:2; u32 use; u64 handle; /* runtime data below here */ struct list_head hook_list ____cacheline_aligned; struct nf_flowtable data; }; struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table, const struct nlattr *nla, u8 genmask); void nf_tables_deactivate_flowtable(const struct nft_ctx *ctx, struct nft_flowtable *flowtable, enum nft_trans_phase phase); void nft_register_flowtable_type(struct nf_flowtable_type *type); void nft_unregister_flowtable_type(struct nf_flowtable_type *type); /** * struct nft_traceinfo - nft tracing information and state * * @trace: other struct members are initialised * @nf_trace: copy of skb->nf_trace before rule evaluation * @type: event type (enum nft_trace_types) * @skbid: hash of skb to be used as trace id * @packet_dumped: packet headers sent in a previous traceinfo message * @basechain: base chain currently processed */ struct nft_traceinfo { bool trace; bool nf_trace; bool packet_dumped; enum nft_trace_types type:8; u32 skbid; const struct nft_base_chain *basechain; }; void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt, const struct nft_chain *basechain); void nft_trace_notify(const struct nft_pktinfo *pkt, const struct nft_verdict *verdict, const struct nft_rule_dp *rule, struct nft_traceinfo *info); #define MODULE_ALIAS_NFT_CHAIN(family, name) \ MODULE_ALIAS("nft-chain-" __stringify(family) "-" name) #define MODULE_ALIAS_NFT_AF_EXPR(family, name) \ MODULE_ALIAS("nft-expr-" __stringify(family) "-" name) #define MODULE_ALIAS_NFT_EXPR(name) \ MODULE_ALIAS("nft-expr-" name) #define MODULE_ALIAS_NFT_OBJ(type) \ MODULE_ALIAS("nft-obj-" __stringify(type)) #if IS_ENABLED(CONFIG_NF_TABLES) /* * The gencursor defines two generations, the currently active and the * next one. Objects contain a bitmask of 2 bits specifying the generations * they're active in. A set bit means they're inactive in the generation * represented by that bit. * * New objects start out as inactive in the current and active in the * next generation. When committing the ruleset the bitmask is cleared, * meaning they're active in all generations. When removing an object, * it is set inactive in the next generation. After committing the ruleset, * the objects are removed. */ static inline unsigned int nft_gencursor_next(const struct net *net) { return net->nft.gencursor + 1 == 1 ? 1 : 0; } static inline u8 nft_genmask_next(const struct net *net) { return 1 << nft_gencursor_next(net); } static inline u8 nft_genmask_cur(const struct net *net) { /* Use READ_ONCE() to prevent refetching the value for atomicity */ return 1 << READ_ONCE(net->nft.gencursor); } #define NFT_GENMASK_ANY ((1 << 0) | (1 << 1)) /* * Generic transaction helpers */ /* Check if this object is currently active. */ #define nft_is_active(__net, __obj) \ (((__obj)->genmask & nft_genmask_cur(__net)) == 0) /* Check if this object is active in the next generation. */ #define nft_is_active_next(__net, __obj) \ (((__obj)->genmask & nft_genmask_next(__net)) == 0) /* This object becomes active in the next generation. */ #define nft_activate_next(__net, __obj) \ (__obj)->genmask = nft_genmask_cur(__net) /* This object becomes inactive in the next generation. */ #define nft_deactivate_next(__net, __obj) \ (__obj)->genmask = nft_genmask_next(__net) /* After committing the ruleset, clear the stale generation bit. */ #define nft_clear(__net, __obj) \ (__obj)->genmask &= ~nft_genmask_next(__net) #define nft_active_genmask(__obj, __genmask) \ !((__obj)->genmask & __genmask) /* * Set element transaction helpers */ static inline bool nft_set_elem_active(const struct nft_set_ext *ext, u8 genmask) { return !(ext->genmask & genmask); } static inline void nft_set_elem_change_active(const struct net *net, const struct nft_set *set, struct nft_set_ext *ext) { ext->genmask ^= nft_genmask_next(net); } #endif /* IS_ENABLED(CONFIG_NF_TABLES) */ #define NFT_SET_ELEM_DEAD_MASK (1 << 2) #if defined(__LITTLE_ENDIAN_BITFIELD) #define NFT_SET_ELEM_DEAD_BIT 2 #elif defined(__BIG_ENDIAN_BITFIELD) #define NFT_SET_ELEM_DEAD_BIT (BITS_PER_LONG - BITS_PER_BYTE + 2) #else #error #endif static inline void nft_set_elem_dead(struct nft_set_ext *ext) { unsigned long *word = (unsigned long *)ext; BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); set_bit(NFT_SET_ELEM_DEAD_BIT, word); } static inline int nft_set_elem_is_dead(const struct nft_set_ext *ext) { unsigned long *word = (unsigned long *)ext; BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); return test_bit(NFT_SET_ELEM_DEAD_BIT, word); } /** * struct nft_trans - nf_tables object update in transaction * * @list: used internally * @binding_list: list of objects with possible bindings * @msg_type: message type * @put_net: ctx->net needs to be put * @ctx: transaction context * @data: internal information related to the transaction */ struct nft_trans { struct list_head list; struct list_head binding_list; int msg_type; bool put_net; struct nft_ctx ctx; char data[]; }; struct nft_trans_rule { struct nft_rule *rule; struct nft_flow_rule *flow; u32 rule_id; bool bound; }; #define nft_trans_rule(trans) \ (((struct nft_trans_rule *)trans->data)->rule) #define nft_trans_flow_rule(trans) \ (((struct nft_trans_rule *)trans->data)->flow) #define nft_trans_rule_id(trans) \ (((struct nft_trans_rule *)trans->data)->rule_id) #define nft_trans_rule_bound(trans) \ (((struct nft_trans_rule *)trans->data)->bound) struct nft_trans_set { struct nft_set *set; u32 set_id; u32 gc_int; u64 timeout; bool update; bool bound; u32 size; }; #define nft_trans_set(trans) \ (((struct nft_trans_set *)trans->data)->set) #define nft_trans_set_id(trans) \ (((struct nft_trans_set *)trans->data)->set_id) #define nft_trans_set_bound(trans) \ (((struct nft_trans_set *)trans->data)->bound) #define nft_trans_set_update(trans) \ (((struct nft_trans_set *)trans->data)->update) #define nft_trans_set_timeout(trans) \ (((struct nft_trans_set *)trans->data)->timeout) #define nft_trans_set_gc_int(trans) \ (((struct nft_trans_set *)trans->data)->gc_int) #define nft_trans_set_size(trans) \ (((struct nft_trans_set *)trans->data)->size) struct nft_trans_chain { struct nft_chain *chain; bool update; char *name; struct nft_stats __percpu *stats; u8 policy; bool bound; u32 chain_id; struct nft_base_chain *basechain; struct list_head hook_list; }; #define nft_trans_chain(trans) \ (((struct nft_trans_chain *)trans->data)->chain) #define nft_trans_chain_update(trans) \ (((struct nft_trans_chain *)trans->data)->update) #define nft_trans_chain_name(trans) \ (((struct nft_trans_chain *)trans->data)->name) #define nft_trans_chain_stats(trans) \ (((struct nft_trans_chain *)trans->data)->stats) #define nft_trans_chain_policy(trans) \ (((struct nft_trans_chain *)trans->data)->policy) #define nft_trans_chain_bound(trans) \ (((struct nft_trans_chain *)trans->data)->bound) #define nft_trans_chain_id(trans) \ (((struct nft_trans_chain *)trans->data)->chain_id) #define nft_trans_basechain(trans) \ (((struct nft_trans_chain *)trans->data)->basechain) #define nft_trans_chain_hooks(trans) \ (((struct nft_trans_chain *)trans->data)->hook_list) struct nft_trans_table { bool update; }; #define nft_trans_table_update(trans) \ (((struct nft_trans_table *)trans->data)->update) struct nft_trans_elem { struct nft_set *set; struct nft_elem_priv *elem_priv; bool bound; }; #define nft_trans_elem_set(trans) \ (((struct nft_trans_elem *)trans->data)->set) #define nft_trans_elem_priv(trans) \ (((struct nft_trans_elem *)trans->data)->elem_priv) #define nft_trans_elem_set_bound(trans) \ (((struct nft_trans_elem *)trans->data)->bound) struct nft_trans_obj { struct nft_object *obj; struct nft_object *newobj; bool update; }; #define nft_trans_obj(trans) \ (((struct nft_trans_obj *)trans->data)->obj) #define nft_trans_obj_newobj(trans) \ (((struct nft_trans_obj *)trans->data)->newobj) #define nft_trans_obj_update(trans) \ (((struct nft_trans_obj *)trans->data)->update) struct nft_trans_flowtable { struct nft_flowtable *flowtable; bool update; struct list_head hook_list; u32 flags; }; #define nft_trans_flowtable(trans) \ (((struct nft_trans_flowtable *)trans->data)->flowtable) #define nft_trans_flowtable_update(trans) \ (((struct nft_trans_flowtable *)trans->data)->update) #define nft_trans_flowtable_hooks(trans) \ (((struct nft_trans_flowtable *)trans->data)->hook_list) #define nft_trans_flowtable_flags(trans) \ (((struct nft_trans_flowtable *)trans->data)->flags) #define NFT_TRANS_GC_BATCHCOUNT 256 struct nft_trans_gc { struct list_head list; struct net *net; struct nft_set *set; u32 seq; u16 count; struct nft_elem_priv *priv[NFT_TRANS_GC_BATCHCOUNT]; struct rcu_head rcu; }; struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set, unsigned int gc_seq, gfp_t gfp); void nft_trans_gc_destroy(struct nft_trans_gc *trans); struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc, unsigned int gc_seq, gfp_t gfp); void nft_trans_gc_queue_async_done(struct nft_trans_gc *gc); struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp); void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans); void nft_trans_gc_elem_add(struct nft_trans_gc *gc, void *priv); struct nft_trans_gc *nft_trans_gc_catchall_async(struct nft_trans_gc *gc, unsigned int gc_seq); struct nft_trans_gc *nft_trans_gc_catchall_sync(struct nft_trans_gc *gc); void nft_setelem_data_deactivate(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv); int __init nft_chain_filter_init(void); void nft_chain_filter_fini(void); void __init nft_chain_route_init(void); void nft_chain_route_fini(void); void nf_tables_trans_destroy_flush_work(void); int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result); __be64 nf_jiffies64_to_msecs(u64 input); #ifdef CONFIG_MODULES __printf(2, 3) int nft_request_module(struct net *net, const char *fmt, ...); #else static inline int nft_request_module(struct net *net, const char *fmt, ...) { return -ENOENT; } #endif struct nftables_pernet { struct list_head tables; struct list_head commit_list; struct list_head binding_list; struct list_head module_list; struct list_head notify_list; struct mutex commit_mutex; u64 table_handle; u64 tstamp; unsigned int base_seq; unsigned int gc_seq; u8 validate_state; }; extern unsigned int nf_tables_net_id; static inline struct nftables_pernet *nft_pernet(const struct net *net) { return net_generic(net, nf_tables_net_id); } static inline u64 nft_net_tstamp(const struct net *net) { return nft_pernet(net)->tstamp; } #define __NFT_REDUCE_READONLY 1UL #define NFT_REDUCE_READONLY (void *)__NFT_REDUCE_READONLY static inline bool nft_reduce_is_readonly(const struct nft_expr *expr) { return expr->ops->reduce == NFT_REDUCE_READONLY; } void nft_reg_track_update(struct nft_regs_track *track, const struct nft_expr *expr, u8 dreg, u8 len); void nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg, u8 len); void __nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg); static inline bool nft_reg_track_cmp(struct nft_regs_track *track, const struct nft_expr *expr, u8 dreg) { return track->regs[dreg].selector && track->regs[dreg].selector->ops == expr->ops && track->regs[dreg].num_reg == 0; } #endif /* _NET_NF_TABLES_H */
512 514 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 /* SPDX-License-Identifier: GPL-2.0+ */ /* * RCU-based infrastructure for lightweight reader-writer locking * * Copyright (c) 2015, Red Hat, Inc. * * Author: Oleg Nesterov <oleg@redhat.com> */ #ifndef _LINUX_RCU_SYNC_H_ #define _LINUX_RCU_SYNC_H_ #include <linux/wait.h> #include <linux/rcupdate.h> /* Structure to mediate between updaters and fastpath-using readers. */ struct rcu_sync { int gp_state; int gp_count; wait_queue_head_t gp_wait; struct rcu_head cb_head; }; /** * rcu_sync_is_idle() - Are readers permitted to use their fastpaths? * @rsp: Pointer to rcu_sync structure to use for synchronization * * Returns true if readers are permitted to use their fastpaths. Must be * invoked within some flavor of RCU read-side critical section. */ static inline bool rcu_sync_is_idle(struct rcu_sync *rsp) { RCU_LOCKDEP_WARN(!rcu_read_lock_any_held(), "suspicious rcu_sync_is_idle() usage"); return !READ_ONCE(rsp->gp_state); /* GP_IDLE */ } extern void rcu_sync_init(struct rcu_sync *); extern void rcu_sync_enter(struct rcu_sync *); extern void rcu_sync_exit(struct rcu_sync *); extern void rcu_sync_dtor(struct rcu_sync *); #define __RCU_SYNC_INITIALIZER(name) { \ .gp_state = 0, \ .gp_count = 0, \ .gp_wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.gp_wait), \ } #define DEFINE_RCU_SYNC(name) \ struct rcu_sync name = __RCU_SYNC_INITIALIZER(name) #endif /* _LINUX_RCU_SYNC_H_ */
1 1 1 30 19 28 10 30 1 1 1 1 1 30 30 30 29 30 14 14 14 14 5 5 5 5 5 5 5 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 // SPDX-License-Identifier: GPL-2.0 /* * Tag allocation using scalable bitmaps. Uses active queue tracking to support * fairer distribution of tags between multiple submitters when a shared tag map * is used. * * Copyright (C) 2013-2014 Jens Axboe */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/delay.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-sched.h" /* * Recalculate wakeup batch when tag is shared by hctx. */ static void blk_mq_update_wake_batch(struct blk_mq_tags *tags, unsigned int users) { if (!users) return; sbitmap_queue_recalculate_wake_batch(&tags->bitmap_tags, users); sbitmap_queue_recalculate_wake_batch(&tags->breserved_tags, users); } /* * If a previously inactive queue goes active, bump the active user count. * We need to do this before try to allocate driver tag, then even if fail * to get tag when first time, the other shared-tag users could reserve * budget for it. */ void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) { unsigned int users; struct blk_mq_tags *tags = hctx->tags; /* * calling test_bit() prior to test_and_set_bit() is intentional, * it avoids dirtying the cacheline if the queue is already active. */ if (blk_mq_is_shared_tags(hctx->flags)) { struct request_queue *q = hctx->queue; if (test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) || test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) return; } else { if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) || test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) return; } spin_lock_irq(&tags->lock); users = tags->active_queues + 1; WRITE_ONCE(tags->active_queues, users); blk_mq_update_wake_batch(tags, users); spin_unlock_irq(&tags->lock); } /* * Wakeup all potentially sleeping on tags */ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve) { sbitmap_queue_wake_all(&tags->bitmap_tags); if (include_reserve) sbitmap_queue_wake_all(&tags->breserved_tags); } /* * If a previously busy queue goes inactive, potential waiters could now * be allowed to queue. Wake them up and check. */ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) { struct blk_mq_tags *tags = hctx->tags; unsigned int users; if (blk_mq_is_shared_tags(hctx->flags)) { struct request_queue *q = hctx->queue; if (!test_and_clear_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) return; } else { if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) return; } spin_lock_irq(&tags->lock); users = tags->active_queues - 1; WRITE_ONCE(tags->active_queues, users); blk_mq_update_wake_batch(tags, users); spin_unlock_irq(&tags->lock); blk_mq_tag_wakeup_all(tags, false); } static int __blk_mq_get_tag(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt) { if (!data->q->elevator && !(data->flags & BLK_MQ_REQ_RESERVED) && !hctx_may_queue(data->hctx, bt)) return BLK_MQ_NO_TAG; if (data->shallow_depth) return sbitmap_queue_get_shallow(bt, data->shallow_depth); else return __sbitmap_queue_get(bt); } unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags, unsigned int *offset) { struct blk_mq_tags *tags = blk_mq_tags_from_data(data); struct sbitmap_queue *bt = &tags->bitmap_tags; unsigned long ret; if (data->shallow_depth ||data->flags & BLK_MQ_REQ_RESERVED || data->hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) return 0; ret = __sbitmap_queue_get_batch(bt, nr_tags, offset); *offset += tags->nr_reserved_tags; return ret; } unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) { struct blk_mq_tags *tags = blk_mq_tags_from_data(data); struct sbitmap_queue *bt; struct sbq_wait_state *ws; DEFINE_SBQ_WAIT(wait); unsigned int tag_offset; int tag; if (data->flags & BLK_MQ_REQ_RESERVED) { if (unlikely(!tags->nr_reserved_tags)) { WARN_ON_ONCE(1); return BLK_MQ_NO_TAG; } bt = &tags->breserved_tags; tag_offset = 0; } else { bt = &tags->bitmap_tags; tag_offset = tags->nr_reserved_tags; } tag = __blk_mq_get_tag(data, bt); if (tag != BLK_MQ_NO_TAG) goto found_tag; if (data->flags & BLK_MQ_REQ_NOWAIT) return BLK_MQ_NO_TAG; ws = bt_wait_ptr(bt, data->hctx); do { struct sbitmap_queue *bt_prev; /* * We're out of tags on this hardware queue, kick any * pending IO submits before going to sleep waiting for * some to complete. */ blk_mq_run_hw_queue(data->hctx, false); /* * Retry tag allocation after running the hardware queue, * as running the queue may also have found completions. */ tag = __blk_mq_get_tag(data, bt); if (tag != BLK_MQ_NO_TAG) break; sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE); tag = __blk_mq_get_tag(data, bt); if (tag != BLK_MQ_NO_TAG) break; bt_prev = bt; io_schedule(); sbitmap_finish_wait(bt, ws, &wait); data->ctx = blk_mq_get_ctx(data->q); data->hctx = blk_mq_map_queue(data->q, data->cmd_flags, data->ctx); tags = blk_mq_tags_from_data(data); if (data->flags & BLK_MQ_REQ_RESERVED) bt = &tags->breserved_tags; else bt = &tags->bitmap_tags; /* * If destination hw queue is changed, fake wake up on * previous queue for compensating the wake up miss, so * other allocations on previous queue won't be starved. */ if (bt != bt_prev) sbitmap_queue_wake_up(bt_prev, 1); ws = bt_wait_ptr(bt, data->hctx); } while (1); sbitmap_finish_wait(bt, ws, &wait); found_tag: /* * Give up this allocation if the hctx is inactive. The caller will * retry on an active hctx. */ if (unlikely(test_bit(BLK_MQ_S_INACTIVE, &data->hctx->state))) { blk_mq_put_tag(tags, data->ctx, tag + tag_offset); return BLK_MQ_NO_TAG; } return tag + tag_offset; } void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, unsigned int tag) { if (!blk_mq_tag_is_reserved(tags, tag)) { const int real_tag = tag - tags->nr_reserved_tags; BUG_ON(real_tag >= tags->nr_tags); sbitmap_queue_clear(&tags->bitmap_tags, real_tag, ctx->cpu); } else { sbitmap_queue_clear(&tags->breserved_tags, tag, ctx->cpu); } } void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags) { sbitmap_queue_clear_batch(&tags->bitmap_tags, tags->nr_reserved_tags, tag_array, nr_tags); } struct bt_iter_data { struct blk_mq_hw_ctx *hctx; struct request_queue *q; busy_tag_iter_fn *fn; void *data; bool reserved; }; static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags, unsigned int bitnr) { struct request *rq; unsigned long flags; spin_lock_irqsave(&tags->lock, flags); rq = tags->rqs[bitnr]; if (!rq || rq->tag != bitnr || !req_ref_inc_not_zero(rq)) rq = NULL; spin_unlock_irqrestore(&tags->lock, flags); return rq; } static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) { struct bt_iter_data *iter_data = data; struct blk_mq_hw_ctx *hctx = iter_data->hctx; struct request_queue *q = iter_data->q; struct blk_mq_tag_set *set = q->tag_set; struct blk_mq_tags *tags; struct request *rq; bool ret = true; if (blk_mq_is_shared_tags(set->flags)) tags = set->shared_tags; else tags = hctx->tags; if (!iter_data->reserved) bitnr += tags->nr_reserved_tags; /* * We can hit rq == NULL here, because the tagging functions * test and set the bit before assigning ->rqs[]. */ rq = blk_mq_find_and_get_req(tags, bitnr); if (!rq) return true; if (rq->q == q && (!hctx || rq->mq_hctx == hctx)) ret = iter_data->fn(rq, iter_data->data); blk_mq_put_rq_ref(rq); return ret; } /** * bt_for_each - iterate over the requests associated with a hardware queue * @hctx: Hardware queue to examine. * @q: Request queue to examine. * @bt: sbitmap to examine. This is either the breserved_tags member * or the bitmap_tags member of struct blk_mq_tags. * @fn: Pointer to the function that will be called for each request * associated with @hctx that has been assigned a driver tag. * @fn will be called as follows: @fn(@hctx, rq, @data, @reserved) * where rq is a pointer to a request. Return true to continue * iterating tags, false to stop. * @data: Will be passed as third argument to @fn. * @reserved: Indicates whether @bt is the breserved_tags member or the * bitmap_tags member of struct blk_mq_tags. */ static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct request_queue *q, struct sbitmap_queue *bt, busy_tag_iter_fn *fn, void *data, bool reserved) { struct bt_iter_data iter_data = { .hctx = hctx, .fn = fn, .data = data, .reserved = reserved, .q = q, }; sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data); } struct bt_tags_iter_data { struct blk_mq_tags *tags; busy_tag_iter_fn *fn; void *data; unsigned int flags; }; #define BT_TAG_ITER_RESERVED (1 << 0) #define BT_TAG_ITER_STARTED (1 << 1) #define BT_TAG_ITER_STATIC_RQS (1 << 2) static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) { struct bt_tags_iter_data *iter_data = data; struct blk_mq_tags *tags = iter_data->tags; struct request *rq; bool ret = true; bool iter_static_rqs = !!(iter_data->flags & BT_TAG_ITER_STATIC_RQS); if (!(iter_data->flags & BT_TAG_ITER_RESERVED)) bitnr += tags->nr_reserved_tags; /* * We can hit rq == NULL here, because the tagging functions * test and set the bit before assigning ->rqs[]. */ if (iter_static_rqs) rq = tags->static_rqs[bitnr]; else rq = blk_mq_find_and_get_req(tags, bitnr); if (!rq) return true; if (!(iter_data->flags & BT_TAG_ITER_STARTED) || blk_mq_request_started(rq)) ret = iter_data->fn(rq, iter_data->data); if (!iter_static_rqs) blk_mq_put_rq_ref(rq); return ret; } /** * bt_tags_for_each - iterate over the requests in a tag map * @tags: Tag map to iterate over. * @bt: sbitmap to examine. This is either the breserved_tags member * or the bitmap_tags member of struct blk_mq_tags. * @fn: Pointer to the function that will be called for each started * request. @fn will be called as follows: @fn(rq, @data, * @reserved) where rq is a pointer to a request. Return true * to continue iterating tags, false to stop. * @data: Will be passed as second argument to @fn. * @flags: BT_TAG_ITER_* */ static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt, busy_tag_iter_fn *fn, void *data, unsigned int flags) { struct bt_tags_iter_data iter_data = { .tags = tags, .fn = fn, .data = data, .flags = flags, }; if (tags->rqs) sbitmap_for_each_set(&bt->sb, bt_tags_iter, &iter_data); } static void __blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, void *priv, unsigned int flags) { WARN_ON_ONCE(flags & BT_TAG_ITER_RESERVED); if (tags->nr_reserved_tags) bt_tags_for_each(tags, &tags->breserved_tags, fn, priv, flags | BT_TAG_ITER_RESERVED); bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, flags); } /** * blk_mq_all_tag_iter - iterate over all requests in a tag map * @tags: Tag map to iterate over. * @fn: Pointer to the function that will be called for each * request. @fn will be called as follows: @fn(rq, @priv, * reserved) where rq is a pointer to a request. 'reserved' * indicates whether or not @rq is a reserved request. Return * true to continue iterating tags, false to stop. * @priv: Will be passed as second argument to @fn. * * Caller has to pass the tag map from which requests are allocated. */ void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, void *priv) { __blk_mq_all_tag_iter(tags, fn, priv, BT_TAG_ITER_STATIC_RQS); } /** * blk_mq_tagset_busy_iter - iterate over all started requests in a tag set * @tagset: Tag set to iterate over. * @fn: Pointer to the function that will be called for each started * request. @fn will be called as follows: @fn(rq, @priv, * reserved) where rq is a pointer to a request. 'reserved' * indicates whether or not @rq is a reserved request. Return * true to continue iterating tags, false to stop. * @priv: Will be passed as second argument to @fn. * * We grab one request reference before calling @fn and release it after * @fn returns. */ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, busy_tag_iter_fn *fn, void *priv) { unsigned int flags = tagset->flags; int i, nr_tags; nr_tags = blk_mq_is_shared_tags(flags) ? 1 : tagset->nr_hw_queues; for (i = 0; i < nr_tags; i++) { if (tagset->tags && tagset->tags[i]) __blk_mq_all_tag_iter(tagset->tags[i], fn, priv, BT_TAG_ITER_STARTED); } } EXPORT_SYMBOL(blk_mq_tagset_busy_iter); static bool blk_mq_tagset_count_completed_rqs(struct request *rq, void *data) { unsigned *count = data; if (blk_mq_request_completed(rq)) (*count)++; return true; } /** * blk_mq_tagset_wait_completed_request - Wait until all scheduled request * completions have finished. * @tagset: Tag set to drain completed request * * Note: This function has to be run after all IO queues are shutdown */ void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset) { while (true) { unsigned count = 0; blk_mq_tagset_busy_iter(tagset, blk_mq_tagset_count_completed_rqs, &count); if (!count) break; msleep(5); } } EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request); /** * blk_mq_queue_tag_busy_iter - iterate over all requests with a driver tag * @q: Request queue to examine. * @fn: Pointer to the function that will be called for each request * on @q. @fn will be called as follows: @fn(hctx, rq, @priv, * reserved) where rq is a pointer to a request and hctx points * to the hardware queue associated with the request. 'reserved' * indicates whether or not @rq is a reserved request. * @priv: Will be passed as third argument to @fn. * * Note: if @q->tag_set is shared with other request queues then @fn will be * called for all requests on all queues that share that tag set and not only * for requests associated with @q. */ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, void *priv) { /* * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and hctx_table * while the queue is frozen. So we can use q_usage_counter to avoid * racing with it. */ if (!percpu_ref_tryget(&q->q_usage_counter)) return; if (blk_mq_is_shared_tags(q->tag_set->flags)) { struct blk_mq_tags *tags = q->tag_set->shared_tags; struct sbitmap_queue *bresv = &tags->breserved_tags; struct sbitmap_queue *btags = &tags->bitmap_tags; if (tags->nr_reserved_tags) bt_for_each(NULL, q, bresv, fn, priv, true); bt_for_each(NULL, q, btags, fn, priv, false); } else { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) { struct blk_mq_tags *tags = hctx->tags; struct sbitmap_queue *bresv = &tags->breserved_tags; struct sbitmap_queue *btags = &tags->bitmap_tags; /* * If no software queues are currently mapped to this * hardware queue, there's nothing to check */ if (!blk_mq_hw_queue_mapped(hctx)) continue; if (tags->nr_reserved_tags) bt_for_each(hctx, q, bresv, fn, priv, true); bt_for_each(hctx, q, btags, fn, priv, false); } } blk_queue_exit(q); } static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth, bool round_robin, int node) { return sbitmap_queue_init_node(bt, depth, -1, round_robin, GFP_KERNEL, node); } int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags, struct sbitmap_queue *breserved_tags, unsigned int queue_depth, unsigned int reserved, int node, int alloc_policy) { unsigned int depth = queue_depth - reserved; bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR; if (bt_alloc(bitmap_tags, depth, round_robin, node)) return -ENOMEM; if (bt_alloc(breserved_tags, reserved, round_robin, node)) goto free_bitmap_tags; return 0; free_bitmap_tags: sbitmap_queue_free(bitmap_tags); return -ENOMEM; } struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, unsigned int reserved_tags, int node, int alloc_policy) { struct blk_mq_tags *tags; if (total_tags > BLK_MQ_TAG_MAX) { pr_err("blk-mq: tag depth too large\n"); return NULL; } tags = kzalloc_node(sizeof(*tags), GFP_KERNEL, node); if (!tags) return NULL; tags->nr_tags = total_tags; tags->nr_reserved_tags = reserved_tags; spin_lock_init(&tags->lock); if (blk_mq_init_bitmaps(&tags->bitmap_tags, &tags->breserved_tags, total_tags, reserved_tags, node, alloc_policy) < 0) { kfree(tags); return NULL; } return tags; } void blk_mq_free_tags(struct blk_mq_tags *tags) { sbitmap_queue_free(&tags->bitmap_tags); sbitmap_queue_free(&tags->breserved_tags); kfree(tags); } int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags **tagsptr, unsigned int tdepth, bool can_grow) { struct blk_mq_tags *tags = *tagsptr; if (tdepth <= tags->nr_reserved_tags) return -EINVAL; /* * If we are allowed to grow beyond the original size, allocate * a new set of tags before freeing the old one. */ if (tdepth > tags->nr_tags) { struct blk_mq_tag_set *set = hctx->queue->tag_set; struct blk_mq_tags *new; if (!can_grow) return -EINVAL; /* * We need some sort of upper limit, set it high enough that * no valid use cases should require more. */ if (tdepth > MAX_SCHED_RQ) return -EINVAL; /* * Only the sbitmap needs resizing since we allocated the max * initially. */ if (blk_mq_is_shared_tags(set->flags)) return 0; new = blk_mq_alloc_map_and_rqs(set, hctx->queue_num, tdepth); if (!new) return -ENOMEM; blk_mq_free_map_and_rqs(set, *tagsptr, hctx->queue_num); *tagsptr = new; } else { /* * Don't need (or can't) update reserved tags here, they * remain static and should never need resizing. */ sbitmap_queue_resize(&tags->bitmap_tags, tdepth - tags->nr_reserved_tags); } return 0; } void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, unsigned int size) { struct blk_mq_tags *tags = set->shared_tags; sbitmap_queue_resize(&tags->bitmap_tags, size - set->reserved_tags); } void blk_mq_tag_update_sched_shared_tags(struct request_queue *q) { sbitmap_queue_resize(&q->sched_shared_tags->bitmap_tags, q->nr_requests - q->tag_set->reserved_tags); } /** * blk_mq_unique_tag() - return a tag that is unique queue-wide * @rq: request for which to compute a unique tag * * The tag field in struct request is unique per hardware queue but not over * all hardware queues. Hence this function that returns a tag with the * hardware context index in the upper bits and the per hardware queue tag in * the lower bits. * * Note: When called for a request that is queued on a non-multiqueue request * queue, the hardware context index is set to zero. */ u32 blk_mq_unique_tag(struct request *rq) { return (rq->mq_hctx->queue_num << BLK_MQ_UNIQUE_TAG_BITS) | (rq->tag & BLK_MQ_UNIQUE_TAG_MASK); } EXPORT_SYMBOL(blk_mq_unique_tag);
14 14 6 14 13 8 14 14 14 6 5 5 5 6 1 3 7 9 1 1 8 8 7 5 1 4 5 5 1 5 2 2 2 2 2 2 2 2 1 2 2 1 1 1 2 2 2 2 2 2 2 2 2 3 3 6 6 6 6 6 6 1 1 39 39 7 7 7 7 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/stat.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/blkdev.h> #include <linux/export.h> #include <linux/mm.h> #include <linux/errno.h> #include <linux/file.h> #include <linux/highuid.h> #include <linux/fs.h> #include <linux/namei.h> #include <linux/security.h> #include <linux/cred.h> #include <linux/syscalls.h> #include <linux/pagemap.h> #include <linux/compat.h> #include <linux/iversion.h> #include <linux/uaccess.h> #include <asm/unistd.h> #include "internal.h" #include "mount.h" /** * generic_fillattr - Fill in the basic attributes from the inode struct * @idmap: idmap of the mount the inode was found from * @request_mask: statx request_mask * @inode: Inode to use as the source * @stat: Where to fill in the attributes * * Fill in the basic attributes in the kstat structure from data that's to be * found on the VFS inode structure. This is the default if no getattr inode * operation is supplied. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before filling in the * uid and gid filds. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply pass @nop_mnt_idmap. */ void generic_fillattr(struct mnt_idmap *idmap, u32 request_mask, struct inode *inode, struct kstat *stat) { vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode); vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); stat->dev = inode->i_sb->s_dev; stat->ino = inode->i_ino; stat->mode = inode->i_mode; stat->nlink = inode->i_nlink; stat->uid = vfsuid_into_kuid(vfsuid); stat->gid = vfsgid_into_kgid(vfsgid); stat->rdev = inode->i_rdev; stat->size = i_size_read(inode); stat->atime = inode_get_atime(inode); stat->mtime = inode_get_mtime(inode); stat->ctime = inode_get_ctime(inode); stat->blksize = i_blocksize(inode); stat->blocks = inode->i_blocks; if ((request_mask & STATX_CHANGE_COOKIE) && IS_I_VERSION(inode)) { stat->result_mask |= STATX_CHANGE_COOKIE; stat->change_cookie = inode_query_iversion(inode); } } EXPORT_SYMBOL(generic_fillattr); /** * generic_fill_statx_attr - Fill in the statx attributes from the inode flags * @inode: Inode to use as the source * @stat: Where to fill in the attribute flags * * Fill in the STATX_ATTR_* flags in the kstat structure for properties of the * inode that are published on i_flags and enforced by the VFS. */ void generic_fill_statx_attr(struct inode *inode, struct kstat *stat) { if (inode->i_flags & S_IMMUTABLE) stat->attributes |= STATX_ATTR_IMMUTABLE; if (inode->i_flags & S_APPEND) stat->attributes |= STATX_ATTR_APPEND; stat->attributes_mask |= KSTAT_ATTR_VFS_FLAGS; } EXPORT_SYMBOL(generic_fill_statx_attr); /** * vfs_getattr_nosec - getattr without security checks * @path: file to get attributes from * @stat: structure to return attributes in * @request_mask: STATX_xxx flags indicating what the caller wants * @query_flags: Query mode (AT_STATX_SYNC_TYPE) * * Get attributes without calling security_inode_getattr. * * Currently the only caller other than vfs_getattr is internal to the * filehandle lookup code, which uses only the inode number and returns no * attributes to any user. Any other code probably wants vfs_getattr. */ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct mnt_idmap *idmap; struct inode *inode = d_backing_inode(path->dentry); memset(stat, 0, sizeof(*stat)); stat->result_mask |= STATX_BASIC_STATS; query_flags &= AT_STATX_SYNC_TYPE; /* allow the fs to override these if it really wants to */ /* SB_NOATIME means filesystem supplies dummy atime value */ if (inode->i_sb->s_flags & SB_NOATIME) stat->result_mask &= ~STATX_ATIME; /* * Note: If you add another clause to set an attribute flag, please * update attributes_mask below. */ if (IS_AUTOMOUNT(inode)) stat->attributes |= STATX_ATTR_AUTOMOUNT; if (IS_DAX(inode)) stat->attributes |= STATX_ATTR_DAX; stat->attributes_mask |= (STATX_ATTR_AUTOMOUNT | STATX_ATTR_DAX); idmap = mnt_idmap(path->mnt); if (inode->i_op->getattr) return inode->i_op->getattr(idmap, path, stat, request_mask, query_flags | AT_GETATTR_NOSEC); generic_fillattr(idmap, request_mask, inode, stat); return 0; } EXPORT_SYMBOL(vfs_getattr_nosec); /* * vfs_getattr - Get the enhanced basic attributes of a file * @path: The file of interest * @stat: Where to return the statistics * @request_mask: STATX_xxx flags indicating what the caller wants * @query_flags: Query mode (AT_STATX_SYNC_TYPE) * * Ask the filesystem for a file's attributes. The caller must indicate in * request_mask and query_flags to indicate what they want. * * If the file is remote, the filesystem can be forced to update the attributes * from the backing store by passing AT_STATX_FORCE_SYNC in query_flags or can * suppress the update by passing AT_STATX_DONT_SYNC. * * Bits must have been set in request_mask to indicate which attributes the * caller wants retrieving. Any such attribute not requested may be returned * anyway, but the value may be approximate, and, if remote, may not have been * synchronised with the server. * * 0 will be returned on success, and a -ve error code if unsuccessful. */ int vfs_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { int retval; if (WARN_ON_ONCE(query_flags & AT_GETATTR_NOSEC)) return -EPERM; retval = security_inode_getattr(path); if (retval) return retval; return vfs_getattr_nosec(path, stat, request_mask, query_flags); } EXPORT_SYMBOL(vfs_getattr); /** * vfs_fstat - Get the basic attributes by file descriptor * @fd: The file descriptor referring to the file of interest * @stat: The result structure to fill in. * * This function is a wrapper around vfs_getattr(). The main difference is * that it uses a file descriptor to determine the file location. * * 0 will be returned on success, and a -ve error code if unsuccessful. */ int vfs_fstat(int fd, struct kstat *stat) { struct fd f; int error; f = fdget_raw(fd); if (!f.file) return -EBADF; error = vfs_getattr(&f.file->f_path, stat, STATX_BASIC_STATS, 0); fdput(f); return error; } int getname_statx_lookup_flags(int flags) { int lookup_flags = 0; if (!(flags & AT_SYMLINK_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; if (!(flags & AT_NO_AUTOMOUNT)) lookup_flags |= LOOKUP_AUTOMOUNT; if (flags & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; return lookup_flags; } /** * vfs_statx - Get basic and extra attributes by filename * @dfd: A file descriptor representing the base dir for a relative filename * @filename: The name of the file of interest * @flags: Flags to control the query * @stat: The result structure to fill in. * @request_mask: STATX_xxx flags indicating what the caller wants * * This function is a wrapper around vfs_getattr(). The main difference is * that it uses a filename and base directory to determine the file location. * Additionally, the use of AT_SYMLINK_NOFOLLOW in flags will prevent a symlink * at the given name from being referenced. * * 0 will be returned on success, and a -ve error code if unsuccessful. */ static int vfs_statx(int dfd, struct filename *filename, int flags, struct kstat *stat, u32 request_mask) { struct path path; unsigned int lookup_flags = getname_statx_lookup_flags(flags); int error; if (flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | AT_EMPTY_PATH | AT_STATX_SYNC_TYPE)) return -EINVAL; retry: error = filename_lookup(dfd, filename, lookup_flags, &path, NULL); if (error) goto out; error = vfs_getattr(&path, stat, request_mask, flags); if (request_mask & STATX_MNT_ID_UNIQUE) { stat->mnt_id = real_mount(path.mnt)->mnt_id_unique; stat->result_mask |= STATX_MNT_ID_UNIQUE; } else { stat->mnt_id = real_mount(path.mnt)->mnt_id; stat->result_mask |= STATX_MNT_ID; } if (path.mnt->mnt_root == path.dentry) stat->attributes |= STATX_ATTR_MOUNT_ROOT; stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT; /* Handle STATX_DIOALIGN for block devices. */ if (request_mask & STATX_DIOALIGN) { struct inode *inode = d_backing_inode(path.dentry); if (S_ISBLK(inode->i_mode)) bdev_statx_dioalign(inode, stat); } path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: return error; } int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat, int flags) { int ret; int statx_flags = flags | AT_NO_AUTOMOUNT; struct filename *name; /* * Work around glibc turning fstat() into fstatat(AT_EMPTY_PATH) * * If AT_EMPTY_PATH is set, we expect the common case to be that * empty path, and avoid doing all the extra pathname work. */ if (dfd >= 0 && flags == AT_EMPTY_PATH) { char c; ret = get_user(c, filename); if (unlikely(ret)) return ret; if (likely(!c)) return vfs_fstat(dfd, stat); } name = getname_flags(filename, getname_statx_lookup_flags(statx_flags), NULL); ret = vfs_statx(dfd, name, statx_flags, stat, STATX_BASIC_STATS); putname(name); return ret; } #ifdef __ARCH_WANT_OLD_STAT /* * For backward compatibility? Maybe this should be moved * into arch/i386 instead? */ static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * statbuf) { static int warncount = 5; struct __old_kernel_stat tmp; if (warncount > 0) { warncount--; printk(KERN_WARNING "VFS: Warning: %s using old stat() call. Recompile your binary.\n", current->comm); } else if (warncount < 0) { /* it's laughable, but... */ warncount = 0; } memset(&tmp, 0, sizeof(struct __old_kernel_stat)); tmp.st_dev = old_encode_dev(stat->dev); tmp.st_ino = stat->ino; if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) return -EOVERFLOW; tmp.st_mode = stat->mode; tmp.st_nlink = stat->nlink; if (tmp.st_nlink != stat->nlink) return -EOVERFLOW; SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); tmp.st_rdev = old_encode_dev(stat->rdev); #if BITS_PER_LONG == 32 if (stat->size > MAX_NON_LFS) return -EOVERFLOW; #endif tmp.st_size = stat->size; tmp.st_atime = stat->atime.tv_sec; tmp.st_mtime = stat->mtime.tv_sec; tmp.st_ctime = stat->ctime.tv_sec; return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0; } SYSCALL_DEFINE2(stat, const char __user *, filename, struct __old_kernel_stat __user *, statbuf) { struct kstat stat; int error; error = vfs_stat(filename, &stat); if (error) return error; return cp_old_stat(&stat, statbuf); } SYSCALL_DEFINE2(lstat, const char __user *, filename, struct __old_kernel_stat __user *, statbuf) { struct kstat stat; int error; error = vfs_lstat(filename, &stat); if (error) return error; return cp_old_stat(&stat, statbuf); } SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, statbuf) { struct kstat stat; int error = vfs_fstat(fd, &stat); if (!error) error = cp_old_stat(&stat, statbuf); return error; } #endif /* __ARCH_WANT_OLD_STAT */ #ifdef __ARCH_WANT_NEW_STAT #ifndef INIT_STRUCT_STAT_PADDING # define INIT_STRUCT_STAT_PADDING(st) memset(&st, 0, sizeof(st)) #endif static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf) { struct stat tmp; if (sizeof(tmp.st_dev) < 4 && !old_valid_dev(stat->dev)) return -EOVERFLOW; if (sizeof(tmp.st_rdev) < 4 && !old_valid_dev(stat->rdev)) return -EOVERFLOW; #if BITS_PER_LONG == 32 if (stat->size > MAX_NON_LFS) return -EOVERFLOW; #endif INIT_STRUCT_STAT_PADDING(tmp); tmp.st_dev = new_encode_dev(stat->dev); tmp.st_ino = stat->ino; if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) return -EOVERFLOW; tmp.st_mode = stat->mode; tmp.st_nlink = stat->nlink; if (tmp.st_nlink != stat->nlink) return -EOVERFLOW; SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); tmp.st_rdev = new_encode_dev(stat->rdev); tmp.st_size = stat->size; tmp.st_atime = stat->atime.tv_sec; tmp.st_mtime = stat->mtime.tv_sec; tmp.st_ctime = stat->ctime.tv_sec; #ifdef STAT_HAVE_NSEC tmp.st_atime_nsec = stat->atime.tv_nsec; tmp.st_mtime_nsec = stat->mtime.tv_nsec; tmp.st_ctime_nsec = stat->ctime.tv_nsec; #endif tmp.st_blocks = stat->blocks; tmp.st_blksize = stat->blksize; return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0; } SYSCALL_DEFINE2(newstat, const char __user *, filename, struct stat __user *, statbuf) { struct kstat stat; int error = vfs_stat(filename, &stat); if (error) return error; return cp_new_stat(&stat, statbuf); } SYSCALL_DEFINE2(newlstat, const char __user *, filename, struct stat __user *, statbuf) { struct kstat stat; int error; error = vfs_lstat(filename, &stat); if (error) return error; return cp_new_stat(&stat, statbuf); } #if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT) SYSCALL_DEFINE4(newfstatat, int, dfd, const char __user *, filename, struct stat __user *, statbuf, int, flag) { struct kstat stat; int error; error = vfs_fstatat(dfd, filename, &stat, flag); if (error) return error; return cp_new_stat(&stat, statbuf); } #endif SYSCALL_DEFINE2(newfstat, unsigned int, fd, struct stat __user *, statbuf) { struct kstat stat; int error = vfs_fstat(fd, &stat); if (!error) error = cp_new_stat(&stat, statbuf); return error; } #endif static int do_readlinkat(int dfd, const char __user *pathname, char __user *buf, int bufsiz) { struct path path; int error; int empty = 0; unsigned int lookup_flags = LOOKUP_EMPTY; if (bufsiz <= 0) return -EINVAL; retry: error = user_path_at_empty(dfd, pathname, lookup_flags, &path, &empty); if (!error) { struct inode *inode = d_backing_inode(path.dentry); error = empty ? -ENOENT : -EINVAL; /* * AFS mountpoints allow readlink(2) but are not symlinks */ if (d_is_symlink(path.dentry) || inode->i_op->readlink) { error = security_inode_readlink(path.dentry); if (!error) { touch_atime(&path); error = vfs_readlink(path.dentry, buf, bufsiz); } } path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } } return error; } SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname, char __user *, buf, int, bufsiz) { return do_readlinkat(dfd, pathname, buf, bufsiz); } SYSCALL_DEFINE3(readlink, const char __user *, path, char __user *, buf, int, bufsiz) { return do_readlinkat(AT_FDCWD, path, buf, bufsiz); } /* ---------- LFS-64 ----------- */ #if defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64) #ifndef INIT_STRUCT_STAT64_PADDING # define INIT_STRUCT_STAT64_PADDING(st) memset(&st, 0, sizeof(st)) #endif static long cp_new_stat64(struct kstat *stat, struct stat64 __user *statbuf) { struct stat64 tmp; INIT_STRUCT_STAT64_PADDING(tmp); #ifdef CONFIG_MIPS /* mips has weird padding, so we don't get 64 bits there */ tmp.st_dev = new_encode_dev(stat->dev); tmp.st_rdev = new_encode_dev(stat->rdev); #else tmp.st_dev = huge_encode_dev(stat->dev); tmp.st_rdev = huge_encode_dev(stat->rdev); #endif tmp.st_ino = stat->ino; if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) return -EOVERFLOW; #ifdef STAT64_HAS_BROKEN_ST_INO tmp.__st_ino = stat->ino; #endif tmp.st_mode = stat->mode; tmp.st_nlink = stat->nlink; tmp.st_uid = from_kuid_munged(current_user_ns(), stat->uid); tmp.st_gid = from_kgid_munged(current_user_ns(), stat->gid); tmp.st_atime = stat->atime.tv_sec; tmp.st_atime_nsec = stat->atime.tv_nsec; tmp.st_mtime = stat->mtime.tv_sec; tmp.st_mtime_nsec = stat->mtime.tv_nsec; tmp.st_ctime = stat->ctime.tv_sec; tmp.st_ctime_nsec = stat->ctime.tv_nsec; tmp.st_size = stat->size; tmp.st_blocks = stat->blocks; tmp.st_blksize = stat->blksize; return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0; } SYSCALL_DEFINE2(stat64, const char __user *, filename, struct stat64 __user *, statbuf) { struct kstat stat; int error = vfs_stat(filename, &stat); if (!error) error = cp_new_stat64(&stat, statbuf); return error; } SYSCALL_DEFINE2(lstat64, const char __user *, filename, struct stat64 __user *, statbuf) { struct kstat stat; int error = vfs_lstat(filename, &stat); if (!error) error = cp_new_stat64(&stat, statbuf); return error; } SYSCALL_DEFINE2(fstat64, unsigned long, fd, struct stat64 __user *, statbuf) { struct kstat stat; int error = vfs_fstat(fd, &stat); if (!error) error = cp_new_stat64(&stat, statbuf); return error; } SYSCALL_DEFINE4(fstatat64, int, dfd, const char __user *, filename, struct stat64 __user *, statbuf, int, flag) { struct kstat stat; int error; error = vfs_fstatat(dfd, filename, &stat, flag); if (error) return error; return cp_new_stat64(&stat, statbuf); } #endif /* __ARCH_WANT_STAT64 || __ARCH_WANT_COMPAT_STAT64 */ static noinline_for_stack int cp_statx(const struct kstat *stat, struct statx __user *buffer) { struct statx tmp; memset(&tmp, 0, sizeof(tmp)); /* STATX_CHANGE_COOKIE is kernel-only for now */ tmp.stx_mask = stat->result_mask & ~STATX_CHANGE_COOKIE; tmp.stx_blksize = stat->blksize; /* STATX_ATTR_CHANGE_MONOTONIC is kernel-only for now */ tmp.stx_attributes = stat->attributes & ~STATX_ATTR_CHANGE_MONOTONIC; tmp.stx_nlink = stat->nlink; tmp.stx_uid = from_kuid_munged(current_user_ns(), stat->uid); tmp.stx_gid = from_kgid_munged(current_user_ns(), stat->gid); tmp.stx_mode = stat->mode; tmp.stx_ino = stat->ino; tmp.stx_size = stat->size; tmp.stx_blocks = stat->blocks; tmp.stx_attributes_mask = stat->attributes_mask; tmp.stx_atime.tv_sec = stat->atime.tv_sec; tmp.stx_atime.tv_nsec = stat->atime.tv_nsec; tmp.stx_btime.tv_sec = stat->btime.tv_sec; tmp.stx_btime.tv_nsec = stat->btime.tv_nsec; tmp.stx_ctime.tv_sec = stat->ctime.tv_sec; tmp.stx_ctime.tv_nsec = stat->ctime.tv_nsec; tmp.stx_mtime.tv_sec = stat->mtime.tv_sec; tmp.stx_mtime.tv_nsec = stat->mtime.tv_nsec; tmp.stx_rdev_major = MAJOR(stat->rdev); tmp.stx_rdev_minor = MINOR(stat->rdev); tmp.stx_dev_major = MAJOR(stat->dev); tmp.stx_dev_minor = MINOR(stat->dev); tmp.stx_mnt_id = stat->mnt_id; tmp.stx_dio_mem_align = stat->dio_mem_align; tmp.stx_dio_offset_align = stat->dio_offset_align; tmp.stx_subvol = stat->subvol; return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0; } int do_statx(int dfd, struct filename *filename, unsigned int flags, unsigned int mask, struct statx __user *buffer) { struct kstat stat; int error; if (mask & STATX__RESERVED) return -EINVAL; if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_TYPE) return -EINVAL; /* STATX_CHANGE_COOKIE is kernel-only for now. Ignore requests * from userland. */ mask &= ~STATX_CHANGE_COOKIE; error = vfs_statx(dfd, filename, flags, &stat, mask); if (error) return error; return cp_statx(&stat, buffer); } /** * sys_statx - System call to get enhanced stats * @dfd: Base directory to pathwalk from *or* fd to stat. * @filename: File to stat or "" with AT_EMPTY_PATH * @flags: AT_* flags to control pathwalk. * @mask: Parts of statx struct actually required. * @buffer: Result buffer. * * Note that fstat() can be emulated by setting dfd to the fd of interest, * supplying "" as the filename and setting AT_EMPTY_PATH in the flags. */ SYSCALL_DEFINE5(statx, int, dfd, const char __user *, filename, unsigned, flags, unsigned int, mask, struct statx __user *, buffer) { int ret; struct filename *name; name = getname_flags(filename, getname_statx_lookup_flags(flags), NULL); ret = do_statx(dfd, name, flags, mask, buffer); putname(name); return ret; } #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_STAT) static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) { struct compat_stat tmp; if (sizeof(tmp.st_dev) < 4 && !old_valid_dev(stat->dev)) return -EOVERFLOW; if (sizeof(tmp.st_rdev) < 4 && !old_valid_dev(stat->rdev)) return -EOVERFLOW; memset(&tmp, 0, sizeof(tmp)); tmp.st_dev = new_encode_dev(stat->dev); tmp.st_ino = stat->ino; if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) return -EOVERFLOW; tmp.st_mode = stat->mode; tmp.st_nlink = stat->nlink; if (tmp.st_nlink != stat->nlink) return -EOVERFLOW; SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); tmp.st_rdev = new_encode_dev(stat->rdev); if ((u64) stat->size > MAX_NON_LFS) return -EOVERFLOW; tmp.st_size = stat->size; tmp.st_atime = stat->atime.tv_sec; tmp.st_atime_nsec = stat->atime.tv_nsec; tmp.st_mtime = stat->mtime.tv_sec; tmp.st_mtime_nsec = stat->mtime.tv_nsec; tmp.st_ctime = stat->ctime.tv_sec; tmp.st_ctime_nsec = stat->ctime.tv_nsec; tmp.st_blocks = stat->blocks; tmp.st_blksize = stat->blksize; return copy_to_user(ubuf, &tmp, sizeof(tmp)) ? -EFAULT : 0; } COMPAT_SYSCALL_DEFINE2(newstat, const char __user *, filename, struct compat_stat __user *, statbuf) { struct kstat stat; int error; error = vfs_stat(filename, &stat); if (error) return error; return cp_compat_stat(&stat, statbuf); } COMPAT_SYSCALL_DEFINE2(newlstat, const char __user *, filename, struct compat_stat __user *, statbuf) { struct kstat stat; int error; error = vfs_lstat(filename, &stat); if (error) return error; return cp_compat_stat(&stat, statbuf); } #ifndef __ARCH_WANT_STAT64 COMPAT_SYSCALL_DEFINE4(newfstatat, unsigned int, dfd, const char __user *, filename, struct compat_stat __user *, statbuf, int, flag) { struct kstat stat; int error; error = vfs_fstatat(dfd, filename, &stat, flag); if (error) return error; return cp_compat_stat(&stat, statbuf); } #endif COMPAT_SYSCALL_DEFINE2(newfstat, unsigned int, fd, struct compat_stat __user *, statbuf) { struct kstat stat; int error = vfs_fstat(fd, &stat); if (!error) error = cp_compat_stat(&stat, statbuf); return error; } #endif /* Caller is here responsible for sufficient locking (ie. inode->i_lock) */ void __inode_add_bytes(struct inode *inode, loff_t bytes) { inode->i_blocks += bytes >> 9; bytes &= 511; inode->i_bytes += bytes; if (inode->i_bytes >= 512) { inode->i_blocks++; inode->i_bytes -= 512; } } EXPORT_SYMBOL(__inode_add_bytes); void inode_add_bytes(struct inode *inode, loff_t bytes) { spin_lock(&inode->i_lock); __inode_add_bytes(inode, bytes); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(inode_add_bytes); void __inode_sub_bytes(struct inode *inode, loff_t bytes) { inode->i_blocks -= bytes >> 9; bytes &= 511; if (inode->i_bytes < bytes) { inode->i_blocks--; inode->i_bytes += 512; } inode->i_bytes -= bytes; } EXPORT_SYMBOL(__inode_sub_bytes); void inode_sub_bytes(struct inode *inode, loff_t bytes) { spin_lock(&inode->i_lock); __inode_sub_bytes(inode, bytes); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(inode_sub_bytes); loff_t inode_get_bytes(struct inode *inode) { loff_t ret; spin_lock(&inode->i_lock); ret = __inode_get_bytes(inode); spin_unlock(&inode->i_lock); return ret; } EXPORT_SYMBOL(inode_get_bytes); void inode_set_bytes(struct inode *inode, loff_t bytes) { /* Caller is here responsible for sufficient locking * (ie. inode->i_lock) */ inode->i_blocks = bytes >> 9; inode->i_bytes = bytes & 511; } EXPORT_SYMBOL(inode_set_bytes);
111 108 111 187 111 108 184 108 2 13 13 95 37 96 37 96 80 97 96 53 52 94 53 50 2 2 2 2 2 2 2 2 2 1 2 44 44 44 44 148 27 3 185 27 128 140 150 186 97 94 150 44 1 9 9 5 9 2 9 2 7 7 7 9 9 7 2 2 2 1 1 11 11 1 11 11 11 11 2 2 2 2 2 2 2 67 66 66 35 184 184 172 177 185 2 2 149 35 28 34 1 35 35 17 36 5 16 3 32 10 32 10 31 25 11 11 2 23 36 12 36 12 35 153 1 151 12 149 10 149 151 10 10 5 5 187 54 186 183 174 187 185 67 186 54 186 185 152 186 187 187 184 186 8 8 8 8 8 8 8 8 8 8 5 8 7 8 8 8 8 8 8 5 5 5 5 5 5 5 5 8 8 8 8 8 14 1 13 12 12 4 4 5 9 8 14 5 1 4 1 1 4 1 1 1 5 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 1 3 1 2 2 2 2 2 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 // SPDX-License-Identifier: GPL-2.0 /* * NETLINK Generic Netlink Family * * Authors: Jamal Hadi Salim * Thomas Graf <tgraf@suug.ch> * Johannes Berg <johannes@sipsolutions.net> */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/string_helpers.h> #include <linux/skbuff.h> #include <linux/mutex.h> #include <linux/bitmap.h> #include <linux/rwsem.h> #include <linux/idr.h> #include <net/sock.h> #include <net/genetlink.h> #include "genetlink.h" static DEFINE_MUTEX(genl_mutex); /* serialization of message processing */ static DECLARE_RWSEM(cb_lock); atomic_t genl_sk_destructing_cnt = ATOMIC_INIT(0); DECLARE_WAIT_QUEUE_HEAD(genl_sk_destructing_waitq); void genl_lock(void) { mutex_lock(&genl_mutex); } EXPORT_SYMBOL(genl_lock); void genl_unlock(void) { mutex_unlock(&genl_mutex); } EXPORT_SYMBOL(genl_unlock); static void genl_lock_all(void) { down_write(&cb_lock); genl_lock(); } static void genl_unlock_all(void) { genl_unlock(); up_write(&cb_lock); } static void genl_op_lock(const struct genl_family *family) { if (!family->parallel_ops) genl_lock(); } static void genl_op_unlock(const struct genl_family *family) { if (!family->parallel_ops) genl_unlock(); } static DEFINE_IDR(genl_fam_idr); /* * Bitmap of multicast groups that are currently in use. * * To avoid an allocation at boot of just one unsigned long, * declare it global instead. * Bit 0 is marked as already used since group 0 is invalid. * Bit 1 is marked as already used since the drop-monitor code * abuses the API and thinks it can statically use group 1. * That group will typically conflict with other groups that * any proper users use. * Bit 16 is marked as used since it's used for generic netlink * and the code no longer marks pre-reserved IDs as used. * Bit 17 is marked as already used since the VFS quota code * also abused this API and relied on family == group ID, we * cater to that by giving it a static family and group ID. * Bit 18 is marked as already used since the PMCRAID driver * did the same thing as the VFS quota code (maybe copied?) */ static unsigned long mc_group_start = 0x3 | BIT(GENL_ID_CTRL) | BIT(GENL_ID_VFS_DQUOT) | BIT(GENL_ID_PMCRAID); static unsigned long *mc_groups = &mc_group_start; static unsigned long mc_groups_longs = 1; /* We need the last attribute with non-zero ID therefore a 2-entry array */ static struct nla_policy genl_policy_reject_all[] = { { .type = NLA_REJECT }, { .type = NLA_REJECT }, }; static int genl_ctrl_event(int event, const struct genl_family *family, const struct genl_multicast_group *grp, int grp_id); static void genl_op_fill_in_reject_policy(const struct genl_family *family, struct genl_ops *op) { BUILD_BUG_ON(ARRAY_SIZE(genl_policy_reject_all) - 1 != 1); if (op->policy || op->cmd < family->resv_start_op) return; op->policy = genl_policy_reject_all; op->maxattr = 1; } static void genl_op_fill_in_reject_policy_split(const struct genl_family *family, struct genl_split_ops *op) { if (op->policy) return; op->policy = genl_policy_reject_all; op->maxattr = 1; } static const struct genl_family *genl_family_find_byid(unsigned int id) { return idr_find(&genl_fam_idr, id); } static const struct genl_family *genl_family_find_byname(char *name) { const struct genl_family *family; unsigned int id; idr_for_each_entry(&genl_fam_idr, family, id) if (strcmp(family->name, name) == 0) return family; return NULL; } struct genl_op_iter { const struct genl_family *family; struct genl_split_ops doit; struct genl_split_ops dumpit; int cmd_idx; int entry_idx; u32 cmd; u8 flags; }; static void genl_op_from_full(const struct genl_family *family, unsigned int i, struct genl_ops *op) { *op = family->ops[i]; if (!op->maxattr) op->maxattr = family->maxattr; if (!op->policy) op->policy = family->policy; genl_op_fill_in_reject_policy(family, op); } static int genl_get_cmd_full(u32 cmd, const struct genl_family *family, struct genl_ops *op) { int i; for (i = 0; i < family->n_ops; i++) if (family->ops[i].cmd == cmd) { genl_op_from_full(family, i, op); return 0; } return -ENOENT; } static void genl_op_from_small(const struct genl_family *family, unsigned int i, struct genl_ops *op) { memset(op, 0, sizeof(*op)); op->doit = family->small_ops[i].doit; op->dumpit = family->small_ops[i].dumpit; op->cmd = family->small_ops[i].cmd; op->internal_flags = family->small_ops[i].internal_flags; op->flags = family->small_ops[i].flags; op->validate = family->small_ops[i].validate; op->maxattr = family->maxattr; op->policy = family->policy; genl_op_fill_in_reject_policy(family, op); } static int genl_get_cmd_small(u32 cmd, const struct genl_family *family, struct genl_ops *op) { int i; for (i = 0; i < family->n_small_ops; i++) if (family->small_ops[i].cmd == cmd) { genl_op_from_small(family, i, op); return 0; } return -ENOENT; } static void genl_op_from_split(struct genl_op_iter *iter) { const struct genl_family *family = iter->family; int i, cnt = 0; i = iter->entry_idx - family->n_ops - family->n_small_ops; if (family->split_ops[i + cnt].flags & GENL_CMD_CAP_DO) { iter->doit = family->split_ops[i + cnt]; genl_op_fill_in_reject_policy_split(family, &iter->doit); cnt++; } else { memset(&iter->doit, 0, sizeof(iter->doit)); } if (i + cnt < family->n_split_ops && family->split_ops[i + cnt].flags & GENL_CMD_CAP_DUMP && (!cnt || family->split_ops[i + cnt].cmd == iter->doit.cmd)) { iter->dumpit = family->split_ops[i + cnt]; genl_op_fill_in_reject_policy_split(family, &iter->dumpit); cnt++; } else { memset(&iter->dumpit, 0, sizeof(iter->dumpit)); } WARN_ON(!cnt); iter->entry_idx += cnt; } static int genl_get_cmd_split(u32 cmd, u8 flag, const struct genl_family *family, struct genl_split_ops *op) { int i; for (i = 0; i < family->n_split_ops; i++) if (family->split_ops[i].cmd == cmd && family->split_ops[i].flags & flag) { *op = family->split_ops[i]; return 0; } return -ENOENT; } static int genl_cmd_full_to_split(struct genl_split_ops *op, const struct genl_family *family, const struct genl_ops *full, u8 flags) { if ((flags & GENL_CMD_CAP_DO && !full->doit) || (flags & GENL_CMD_CAP_DUMP && !full->dumpit)) { memset(op, 0, sizeof(*op)); return -ENOENT; } if (flags & GENL_CMD_CAP_DUMP) { op->start = full->start; op->dumpit = full->dumpit; op->done = full->done; } else { op->pre_doit = family->pre_doit; op->doit = full->doit; op->post_doit = family->post_doit; } if (flags & GENL_CMD_CAP_DUMP && full->validate & GENL_DONT_VALIDATE_DUMP) { op->policy = NULL; op->maxattr = 0; } else { op->policy = full->policy; op->maxattr = full->maxattr; } op->cmd = full->cmd; op->internal_flags = full->internal_flags; op->flags = full->flags; op->validate = full->validate; /* Make sure flags include the GENL_CMD_CAP_DO / GENL_CMD_CAP_DUMP */ op->flags |= flags; return 0; } /* Must make sure that op is initialized to 0 on failure */ static int genl_get_cmd(u32 cmd, u8 flags, const struct genl_family *family, struct genl_split_ops *op) { struct genl_ops full; int err; err = genl_get_cmd_full(cmd, family, &full); if (err == -ENOENT) err = genl_get_cmd_small(cmd, family, &full); /* Found one of legacy forms */ if (err == 0) return genl_cmd_full_to_split(op, family, &full, flags); err = genl_get_cmd_split(cmd, flags, family, op); if (err) memset(op, 0, sizeof(*op)); return err; } /* For policy dumping only, get ops of both do and dump. * Fail if both are missing, genl_get_cmd() will zero-init in case of failure. */ static int genl_get_cmd_both(u32 cmd, const struct genl_family *family, struct genl_split_ops *doit, struct genl_split_ops *dumpit) { int err1, err2; err1 = genl_get_cmd(cmd, GENL_CMD_CAP_DO, family, doit); err2 = genl_get_cmd(cmd, GENL_CMD_CAP_DUMP, family, dumpit); return err1 && err2 ? -ENOENT : 0; } static bool genl_op_iter_init(const struct genl_family *family, struct genl_op_iter *iter) { iter->family = family; iter->cmd_idx = 0; iter->entry_idx = 0; iter->flags = 0; return iter->family->n_ops + iter->family->n_small_ops + iter->family->n_split_ops; } static bool genl_op_iter_next(struct genl_op_iter *iter) { const struct genl_family *family = iter->family; bool legacy_op = true; struct genl_ops op; if (iter->entry_idx < family->n_ops) { genl_op_from_full(family, iter->entry_idx, &op); } else if (iter->entry_idx < family->n_ops + family->n_small_ops) { genl_op_from_small(family, iter->entry_idx - family->n_ops, &op); } else if (iter->entry_idx < family->n_ops + family->n_small_ops + family->n_split_ops) { legacy_op = false; /* updates entry_idx */ genl_op_from_split(iter); } else { return false; } iter->cmd_idx++; if (legacy_op) { iter->entry_idx++; genl_cmd_full_to_split(&iter->doit, family, &op, GENL_CMD_CAP_DO); genl_cmd_full_to_split(&iter->dumpit, family, &op, GENL_CMD_CAP_DUMP); } iter->cmd = iter->doit.cmd | iter->dumpit.cmd; iter->flags = iter->doit.flags | iter->dumpit.flags; return true; } static void genl_op_iter_copy(struct genl_op_iter *dst, struct genl_op_iter *src) { *dst = *src; } static unsigned int genl_op_iter_idx(struct genl_op_iter *iter) { return iter->cmd_idx; } static int genl_allocate_reserve_groups(int n_groups, int *first_id) { unsigned long *new_groups; int start = 0; int i; int id; bool fits; do { if (start == 0) id = find_first_zero_bit(mc_groups, mc_groups_longs * BITS_PER_LONG); else id = find_next_zero_bit(mc_groups, mc_groups_longs * BITS_PER_LONG, start); fits = true; for (i = id; i < min_t(int, id + n_groups, mc_groups_longs * BITS_PER_LONG); i++) { if (test_bit(i, mc_groups)) { start = i; fits = false; break; } } if (id + n_groups > mc_groups_longs * BITS_PER_LONG) { unsigned long new_longs = mc_groups_longs + BITS_TO_LONGS(n_groups); size_t nlen = new_longs * sizeof(unsigned long); if (mc_groups == &mc_group_start) { new_groups = kzalloc(nlen, GFP_KERNEL); if (!new_groups) return -ENOMEM; mc_groups = new_groups; *mc_groups = mc_group_start; } else { new_groups = krealloc(mc_groups, nlen, GFP_KERNEL); if (!new_groups) return -ENOMEM; mc_groups = new_groups; for (i = 0; i < BITS_TO_LONGS(n_groups); i++) mc_groups[mc_groups_longs + i] = 0; } mc_groups_longs = new_longs; } } while (!fits); for (i = id; i < id + n_groups; i++) set_bit(i, mc_groups); *first_id = id; return 0; } static struct genl_family genl_ctrl; static int genl_validate_assign_mc_groups(struct genl_family *family) { int first_id; int n_groups = family->n_mcgrps; int err = 0, i; bool groups_allocated = false; if (!n_groups) return 0; for (i = 0; i < n_groups; i++) { const struct genl_multicast_group *grp = &family->mcgrps[i]; if (WARN_ON(grp->name[0] == '\0')) return -EINVAL; if (WARN_ON(!string_is_terminated(grp->name, GENL_NAMSIZ))) return -EINVAL; } /* special-case our own group and hacks */ if (family == &genl_ctrl) { first_id = GENL_ID_CTRL; BUG_ON(n_groups != 1); } else if (strcmp(family->name, "NET_DM") == 0) { first_id = 1; BUG_ON(n_groups != 1); } else if (family->id == GENL_ID_VFS_DQUOT) { first_id = GENL_ID_VFS_DQUOT; BUG_ON(n_groups != 1); } else if (family->id == GENL_ID_PMCRAID) { first_id = GENL_ID_PMCRAID; BUG_ON(n_groups != 1); } else { groups_allocated = true; err = genl_allocate_reserve_groups(n_groups, &first_id); if (err) return err; } family->mcgrp_offset = first_id; /* if still initializing, can't and don't need to realloc bitmaps */ if (!init_net.genl_sock) return 0; if (family->netnsok) { struct net *net; netlink_table_grab(); rcu_read_lock(); for_each_net_rcu(net) { err = __netlink_change_ngroups(net->genl_sock, mc_groups_longs * BITS_PER_LONG); if (err) { /* * No need to roll back, can only fail if * memory allocation fails and then the * number of _possible_ groups has been * increased on some sockets which is ok. */ break; } } rcu_read_unlock(); netlink_table_ungrab(); } else { err = netlink_change_ngroups(init_net.genl_sock, mc_groups_longs * BITS_PER_LONG); } if (groups_allocated && err) { for (i = 0; i < family->n_mcgrps; i++) clear_bit(family->mcgrp_offset + i, mc_groups); } return err; } static void genl_unregister_mc_groups(const struct genl_family *family) { struct net *net; int i; netlink_table_grab(); rcu_read_lock(); for_each_net_rcu(net) { for (i = 0; i < family->n_mcgrps; i++) __netlink_clear_multicast_users( net->genl_sock, family->mcgrp_offset + i); } rcu_read_unlock(); netlink_table_ungrab(); for (i = 0; i < family->n_mcgrps; i++) { int grp_id = family->mcgrp_offset + i; if (grp_id != 1) clear_bit(grp_id, mc_groups); genl_ctrl_event(CTRL_CMD_DELMCAST_GRP, family, &family->mcgrps[i], grp_id); } } static bool genl_split_op_check(const struct genl_split_ops *op) { if (WARN_ON(hweight8(op->flags & (GENL_CMD_CAP_DO | GENL_CMD_CAP_DUMP)) != 1)) return true; return false; } static int genl_validate_ops(const struct genl_family *family) { struct genl_op_iter i, j; unsigned int s; if (WARN_ON(family->n_ops && !family->ops) || WARN_ON(family->n_small_ops && !family->small_ops) || WARN_ON(family->n_split_ops && !family->split_ops)) return -EINVAL; for (genl_op_iter_init(family, &i); genl_op_iter_next(&i); ) { if (!(i.flags & (GENL_CMD_CAP_DO | GENL_CMD_CAP_DUMP))) return -EINVAL; if (WARN_ON(i.cmd >= family->resv_start_op && (i.doit.validate || i.dumpit.validate))) return -EINVAL; genl_op_iter_copy(&j, &i); while (genl_op_iter_next(&j)) { if (i.cmd == j.cmd) return -EINVAL; } } if (family->n_split_ops) { if (genl_split_op_check(&family->split_ops[0])) return -EINVAL; } for (s = 1; s < family->n_split_ops; s++) { const struct genl_split_ops *a, *b; a = &family->split_ops[s - 1]; b = &family->split_ops[s]; if (genl_split_op_check(b)) return -EINVAL; /* Check sort order */ if (a->cmd < b->cmd) { continue; } else if (a->cmd > b->cmd) { WARN_ON(1); return -EINVAL; } if (a->internal_flags != b->internal_flags || ((a->flags ^ b->flags) & ~(GENL_CMD_CAP_DO | GENL_CMD_CAP_DUMP))) { WARN_ON(1); return -EINVAL; } if ((a->flags & GENL_CMD_CAP_DO) && (b->flags & GENL_CMD_CAP_DUMP)) continue; WARN_ON(1); return -EINVAL; } return 0; } static void *genl_sk_priv_alloc(struct genl_family *family) { void *priv; priv = kzalloc(family->sock_priv_size, GFP_KERNEL); if (!priv) return ERR_PTR(-ENOMEM); if (family->sock_priv_init) family->sock_priv_init(priv); return priv; } static void genl_sk_priv_free(const struct genl_family *family, void *priv) { if (family->sock_priv_destroy) family->sock_priv_destroy(priv); kfree(priv); } static int genl_sk_privs_alloc(struct genl_family *family) { if (!family->sock_priv_size) return 0; family->sock_privs = kzalloc(sizeof(*family->sock_privs), GFP_KERNEL); if (!family->sock_privs) return -ENOMEM; xa_init(family->sock_privs); return 0; } static void genl_sk_privs_free(const struct genl_family *family) { unsigned long id; void *priv; if (!family->sock_priv_size) return; xa_for_each(family->sock_privs, id, priv) genl_sk_priv_free(family, priv); xa_destroy(family->sock_privs); kfree(family->sock_privs); } static void genl_sk_priv_free_by_sock(struct genl_family *family, struct sock *sk) { void *priv; if (!family->sock_priv_size) return; priv = xa_erase(family->sock_privs, (unsigned long) sk); if (!priv) return; genl_sk_priv_free(family, priv); } static void genl_release(struct sock *sk, unsigned long *groups) { struct genl_family *family; unsigned int id; down_read(&cb_lock); idr_for_each_entry(&genl_fam_idr, family, id) genl_sk_priv_free_by_sock(family, sk); up_read(&cb_lock); } /** * __genl_sk_priv_get - Get family private pointer for socket, if exists * * @family: family * @sk: socket * * Lookup a private memory for a Generic netlink family and specified socket. * * Caller should make sure this is called in RCU read locked section. * * Return: valid pointer on success, otherwise negative error value * encoded by ERR_PTR(), NULL in case priv does not exist. */ void *__genl_sk_priv_get(struct genl_family *family, struct sock *sk) { if (WARN_ON_ONCE(!family->sock_privs)) return ERR_PTR(-EINVAL); return xa_load(family->sock_privs, (unsigned long) sk); } /** * genl_sk_priv_get - Get family private pointer for socket * * @family: family * @sk: socket * * Lookup a private memory for a Generic netlink family and specified socket. * Allocate the private memory in case it was not already done. * * Return: valid pointer on success, otherwise negative error value * encoded by ERR_PTR(). */ void *genl_sk_priv_get(struct genl_family *family, struct sock *sk) { void *priv, *old_priv; priv = __genl_sk_priv_get(family, sk); if (priv) return priv; /* priv for the family does not exist so far, create it. */ priv = genl_sk_priv_alloc(family); if (IS_ERR(priv)) return ERR_CAST(priv); old_priv = xa_cmpxchg(family->sock_privs, (unsigned long) sk, NULL, priv, GFP_KERNEL); if (old_priv) { genl_sk_priv_free(family, priv); if (xa_is_err(old_priv)) return ERR_PTR(xa_err(old_priv)); /* Race happened, priv for the socket was already inserted. */ return old_priv; } return priv; } /** * genl_register_family - register a generic netlink family * @family: generic netlink family * * Registers the specified family after validating it first. Only one * family may be registered with the same family name or identifier. * * The family's ops, multicast groups and module pointer must already * be assigned. * * Return 0 on success or a negative error code. */ int genl_register_family(struct genl_family *family) { int err, i; int start = GENL_START_ALLOC, end = GENL_MAX_ID; err = genl_validate_ops(family); if (err) return err; genl_lock_all(); if (genl_family_find_byname(family->name)) { err = -EEXIST; goto errout_locked; } err = genl_sk_privs_alloc(family); if (err) goto errout_locked; /* * Sadly, a few cases need to be special-cased * due to them having previously abused the API * and having used their family ID also as their * multicast group ID, so we use reserved IDs * for both to be sure we can do that mapping. */ if (family == &genl_ctrl) { /* and this needs to be special for initial family lookups */ start = end = GENL_ID_CTRL; } else if (strcmp(family->name, "pmcraid") == 0) { start = end = GENL_ID_PMCRAID; } else if (strcmp(family->name, "VFS_DQUOT") == 0) { start = end = GENL_ID_VFS_DQUOT; } family->id = idr_alloc_cyclic(&genl_fam_idr, family, start, end + 1, GFP_KERNEL); if (family->id < 0) { err = family->id; goto errout_sk_privs_free; } err = genl_validate_assign_mc_groups(family); if (err) goto errout_remove; genl_unlock_all(); /* send all events */ genl_ctrl_event(CTRL_CMD_NEWFAMILY, family, NULL, 0); for (i = 0; i < family->n_mcgrps; i++) genl_ctrl_event(CTRL_CMD_NEWMCAST_GRP, family, &family->mcgrps[i], family->mcgrp_offset + i); return 0; errout_remove: idr_remove(&genl_fam_idr, family->id); errout_sk_privs_free: genl_sk_privs_free(family); errout_locked: genl_unlock_all(); return err; } EXPORT_SYMBOL(genl_register_family); /** * genl_unregister_family - unregister generic netlink family * @family: generic netlink family * * Unregisters the specified family. * * Returns 0 on success or a negative error code. */ int genl_unregister_family(const struct genl_family *family) { genl_lock_all(); if (!genl_family_find_byid(family->id)) { genl_unlock_all(); return -ENOENT; } genl_unregister_mc_groups(family); idr_remove(&genl_fam_idr, family->id); up_write(&cb_lock); wait_event(genl_sk_destructing_waitq, atomic_read(&genl_sk_destructing_cnt) == 0); genl_sk_privs_free(family); genl_unlock(); genl_ctrl_event(CTRL_CMD_DELFAMILY, family, NULL, 0); return 0; } EXPORT_SYMBOL(genl_unregister_family); /** * genlmsg_put - Add generic netlink header to netlink message * @skb: socket buffer holding the message * @portid: netlink portid the message is addressed to * @seq: sequence number (usually the one of the sender) * @family: generic netlink family * @flags: netlink message flags * @cmd: generic netlink command * * Returns pointer to user specific header */ void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, const struct genl_family *family, int flags, u8 cmd) { struct nlmsghdr *nlh; struct genlmsghdr *hdr; nlh = nlmsg_put(skb, portid, seq, family->id, GENL_HDRLEN + family->hdrsize, flags); if (nlh == NULL) return NULL; hdr = nlmsg_data(nlh); hdr->cmd = cmd; hdr->version = family->version; hdr->reserved = 0; return (char *) hdr + GENL_HDRLEN; } EXPORT_SYMBOL(genlmsg_put); static struct genl_dumpit_info *genl_dumpit_info_alloc(void) { return kmalloc(sizeof(struct genl_dumpit_info), GFP_KERNEL); } static void genl_dumpit_info_free(const struct genl_dumpit_info *info) { kfree(info); } static struct nlattr ** genl_family_rcv_msg_attrs_parse(const struct genl_family *family, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, const struct genl_split_ops *ops, int hdrlen, enum genl_validate_flags no_strict_flag) { enum netlink_validation validate = ops->validate & no_strict_flag ? NL_VALIDATE_LIBERAL : NL_VALIDATE_STRICT; struct nlattr **attrbuf; int err; if (!ops->maxattr) return NULL; attrbuf = kmalloc_array(ops->maxattr + 1, sizeof(struct nlattr *), GFP_KERNEL); if (!attrbuf) return ERR_PTR(-ENOMEM); err = __nlmsg_parse(nlh, hdrlen, attrbuf, ops->maxattr, ops->policy, validate, extack); if (err) { kfree(attrbuf); return ERR_PTR(err); } return attrbuf; } static void genl_family_rcv_msg_attrs_free(struct nlattr **attrbuf) { kfree(attrbuf); } struct genl_start_context { const struct genl_family *family; struct nlmsghdr *nlh; struct netlink_ext_ack *extack; const struct genl_split_ops *ops; int hdrlen; }; static int genl_start(struct netlink_callback *cb) { struct genl_start_context *ctx = cb->data; const struct genl_split_ops *ops; struct genl_dumpit_info *info; struct nlattr **attrs = NULL; int rc = 0; ops = ctx->ops; if (!(ops->validate & GENL_DONT_VALIDATE_DUMP) && ctx->nlh->nlmsg_len < nlmsg_msg_size(ctx->hdrlen)) return -EINVAL; attrs = genl_family_rcv_msg_attrs_parse(ctx->family, ctx->nlh, ctx->extack, ops, ctx->hdrlen, GENL_DONT_VALIDATE_DUMP_STRICT); if (IS_ERR(attrs)) return PTR_ERR(attrs); info = genl_dumpit_info_alloc(); if (!info) { genl_family_rcv_msg_attrs_free(attrs); return -ENOMEM; } info->op = *ops; info->info.family = ctx->family; info->info.snd_seq = cb->nlh->nlmsg_seq; info->info.snd_portid = NETLINK_CB(cb->skb).portid; info->info.nlhdr = cb->nlh; info->info.genlhdr = nlmsg_data(cb->nlh); info->info.attrs = attrs; genl_info_net_set(&info->info, sock_net(cb->skb->sk)); info->info.extack = cb->extack; memset(&info->info.user_ptr, 0, sizeof(info->info.user_ptr)); cb->data = info; if (ops->start) { genl_op_lock(ctx->family); rc = ops->start(cb); genl_op_unlock(ctx->family); } if (rc) { genl_family_rcv_msg_attrs_free(info->info.attrs); genl_dumpit_info_free(info); cb->data = NULL; } return rc; } static int genl_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct genl_dumpit_info *dump_info = cb->data; const struct genl_split_ops *ops = &dump_info->op; struct genl_info *info = &dump_info->info; int rc; info->extack = cb->extack; genl_op_lock(info->family); rc = ops->dumpit(skb, cb); genl_op_unlock(info->family); return rc; } static int genl_done(struct netlink_callback *cb) { struct genl_dumpit_info *dump_info = cb->data; const struct genl_split_ops *ops = &dump_info->op; struct genl_info *info = &dump_info->info; int rc = 0; info->extack = cb->extack; if (ops->done) { genl_op_lock(info->family); rc = ops->done(cb); genl_op_unlock(info->family); } genl_family_rcv_msg_attrs_free(info->attrs); genl_dumpit_info_free(dump_info); return rc; } static int genl_family_rcv_msg_dumpit(const struct genl_family *family, struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, const struct genl_split_ops *ops, int hdrlen, struct net *net) { struct genl_start_context ctx; struct netlink_dump_control c = { .module = family->module, .data = &ctx, .start = genl_start, .dump = genl_dumpit, .done = genl_done, .extack = extack, }; int err; ctx.family = family; ctx.nlh = nlh; ctx.extack = extack; ctx.ops = ops; ctx.hdrlen = hdrlen; genl_op_unlock(family); err = __netlink_dump_start(net->genl_sock, skb, nlh, &c); genl_op_lock(family); return err; } static int genl_family_rcv_msg_doit(const struct genl_family *family, struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, const struct genl_split_ops *ops, int hdrlen, struct net *net) { struct nlattr **attrbuf; struct genl_info info; int err; attrbuf = genl_family_rcv_msg_attrs_parse(family, nlh, extack, ops, hdrlen, GENL_DONT_VALIDATE_STRICT); if (IS_ERR(attrbuf)) return PTR_ERR(attrbuf); info.snd_seq = nlh->nlmsg_seq; info.snd_portid = NETLINK_CB(skb).portid; info.family = family; info.nlhdr = nlh; info.genlhdr = nlmsg_data(nlh); info.attrs = attrbuf; info.extack = extack; genl_info_net_set(&info, net); memset(&info.user_ptr, 0, sizeof(info.user_ptr)); if (ops->pre_doit) { err = ops->pre_doit(ops, skb, &info); if (err) goto out; } err = ops->doit(skb, &info); if (ops->post_doit) ops->post_doit(ops, skb, &info); out: genl_family_rcv_msg_attrs_free(attrbuf); return err; } static int genl_header_check(const struct genl_family *family, struct nlmsghdr *nlh, struct genlmsghdr *hdr, struct netlink_ext_ack *extack) { u16 flags; /* Only for commands added after we started validating */ if (hdr->cmd < family->resv_start_op) return 0; if (hdr->reserved) { NL_SET_ERR_MSG(extack, "genlmsghdr.reserved field is not 0"); return -EINVAL; } /* Old netlink flags have pretty loose semantics, allow only the flags * consumed by the core where we can enforce the meaning. */ flags = nlh->nlmsg_flags; if ((flags & NLM_F_DUMP) == NLM_F_DUMP) /* DUMP is 2 bits */ flags &= ~NLM_F_DUMP; if (flags & ~(NLM_F_REQUEST | NLM_F_ACK | NLM_F_ECHO)) { NL_SET_ERR_MSG(extack, "ambiguous or reserved bits set in nlmsg_flags"); return -EINVAL; } return 0; } static int genl_family_rcv_msg(const struct genl_family *family, struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct genlmsghdr *hdr = nlmsg_data(nlh); struct genl_split_ops op; int hdrlen; u8 flags; /* this family doesn't exist in this netns */ if (!family->netnsok && !net_eq(net, &init_net)) return -ENOENT; hdrlen = GENL_HDRLEN + family->hdrsize; if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) return -EINVAL; if (genl_header_check(family, nlh, hdr, extack)) return -EINVAL; flags = (nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP ? GENL_CMD_CAP_DUMP : GENL_CMD_CAP_DO; if (genl_get_cmd(hdr->cmd, flags, family, &op)) return -EOPNOTSUPP; if ((op.flags & GENL_ADMIN_PERM) && !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if ((op.flags & GENL_UNS_ADMIN_PERM) && !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) return -EPERM; if (flags & GENL_CMD_CAP_DUMP) return genl_family_rcv_msg_dumpit(family, skb, nlh, extack, &op, hdrlen, net); else return genl_family_rcv_msg_doit(family, skb, nlh, extack, &op, hdrlen, net); } static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { const struct genl_family *family; int err; family = genl_family_find_byid(nlh->nlmsg_type); if (family == NULL) return -ENOENT; genl_op_lock(family); err = genl_family_rcv_msg(family, skb, nlh, extack); genl_op_unlock(family); return err; } static void genl_rcv(struct sk_buff *skb) { down_read(&cb_lock); netlink_rcv_skb(skb, &genl_rcv_msg); up_read(&cb_lock); } /************************************************************************** * Controller **************************************************************************/ static struct genl_family genl_ctrl; static int ctrl_fill_info(const struct genl_family *family, u32 portid, u32 seq, u32 flags, struct sk_buff *skb, u8 cmd) { struct genl_op_iter i; void *hdr; hdr = genlmsg_put(skb, portid, seq, &genl_ctrl, flags, cmd); if (hdr == NULL) return -EMSGSIZE; if (nla_put_string(skb, CTRL_ATTR_FAMILY_NAME, family->name) || nla_put_u16(skb, CTRL_ATTR_FAMILY_ID, family->id) || nla_put_u32(skb, CTRL_ATTR_VERSION, family->version) || nla_put_u32(skb, CTRL_ATTR_HDRSIZE, family->hdrsize) || nla_put_u32(skb, CTRL_ATTR_MAXATTR, family->maxattr)) goto nla_put_failure; if (genl_op_iter_init(family, &i)) { struct nlattr *nla_ops; nla_ops = nla_nest_start_noflag(skb, CTRL_ATTR_OPS); if (nla_ops == NULL) goto nla_put_failure; while (genl_op_iter_next(&i)) { struct nlattr *nest; u32 op_flags; op_flags = i.flags; if (i.doit.policy || i.dumpit.policy) op_flags |= GENL_CMD_CAP_HASPOL; nest = nla_nest_start_noflag(skb, genl_op_iter_idx(&i)); if (nest == NULL) goto nla_put_failure; if (nla_put_u32(skb, CTRL_ATTR_OP_ID, i.cmd) || nla_put_u32(skb, CTRL_ATTR_OP_FLAGS, op_flags)) goto nla_put_failure; nla_nest_end(skb, nest); } nla_nest_end(skb, nla_ops); } if (family->n_mcgrps) { struct nlattr *nla_grps; int i; nla_grps = nla_nest_start_noflag(skb, CTRL_ATTR_MCAST_GROUPS); if (nla_grps == NULL) goto nla_put_failure; for (i = 0; i < family->n_mcgrps; i++) { struct nlattr *nest; const struct genl_multicast_group *grp; grp = &family->mcgrps[i]; nest = nla_nest_start_noflag(skb, i + 1); if (nest == NULL) goto nla_put_failure; if (nla_put_u32(skb, CTRL_ATTR_MCAST_GRP_ID, family->mcgrp_offset + i) || nla_put_string(skb, CTRL_ATTR_MCAST_GRP_NAME, grp->name)) goto nla_put_failure; nla_nest_end(skb, nest); } nla_nest_end(skb, nla_grps); } genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -EMSGSIZE; } static int ctrl_fill_mcgrp_info(const struct genl_family *family, const struct genl_multicast_group *grp, int grp_id, u32 portid, u32 seq, u32 flags, struct sk_buff *skb, u8 cmd) { void *hdr; struct nlattr *nla_grps; struct nlattr *nest; hdr = genlmsg_put(skb, portid, seq, &genl_ctrl, flags, cmd); if (hdr == NULL) return -1; if (nla_put_string(skb, CTRL_ATTR_FAMILY_NAME, family->name) || nla_put_u16(skb, CTRL_ATTR_FAMILY_ID, family->id)) goto nla_put_failure; nla_grps = nla_nest_start_noflag(skb, CTRL_ATTR_MCAST_GROUPS); if (nla_grps == NULL) goto nla_put_failure; nest = nla_nest_start_noflag(skb, 1); if (nest == NULL) goto nla_put_failure; if (nla_put_u32(skb, CTRL_ATTR_MCAST_GRP_ID, grp_id) || nla_put_string(skb, CTRL_ATTR_MCAST_GRP_NAME, grp->name)) goto nla_put_failure; nla_nest_end(skb, nest); nla_nest_end(skb, nla_grps); genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -EMSGSIZE; } static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb) { int n = 0; struct genl_family *rt; struct net *net = sock_net(skb->sk); int fams_to_skip = cb->args[0]; unsigned int id; int err = 0; idr_for_each_entry(&genl_fam_idr, rt, id) { if (!rt->netnsok && !net_eq(net, &init_net)) continue; if (n++ < fams_to_skip) continue; err = ctrl_fill_info(rt, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, skb, CTRL_CMD_NEWFAMILY); if (err) { n--; break; } } cb->args[0] = n; return err; } static struct sk_buff *ctrl_build_family_msg(const struct genl_family *family, u32 portid, int seq, u8 cmd) { struct sk_buff *skb; int err; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (skb == NULL) return ERR_PTR(-ENOBUFS); err = ctrl_fill_info(family, portid, seq, 0, skb, cmd); if (err < 0) { nlmsg_free(skb); return ERR_PTR(err); } return skb; } static struct sk_buff * ctrl_build_mcgrp_msg(const struct genl_family *family, const struct genl_multicast_group *grp, int grp_id, u32 portid, int seq, u8 cmd) { struct sk_buff *skb; int err; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (skb == NULL) return ERR_PTR(-ENOBUFS); err = ctrl_fill_mcgrp_info(family, grp, grp_id, portid, seq, 0, skb, cmd); if (err < 0) { nlmsg_free(skb); return ERR_PTR(err); } return skb; } static const struct nla_policy ctrl_policy_family[] = { [CTRL_ATTR_FAMILY_ID] = { .type = NLA_U16 }, [CTRL_ATTR_FAMILY_NAME] = { .type = NLA_NUL_STRING, .len = GENL_NAMSIZ - 1 }, }; static int ctrl_getfamily(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *msg; const struct genl_family *res = NULL; int err = -EINVAL; if (info->attrs[CTRL_ATTR_FAMILY_ID]) { u16 id = nla_get_u16(info->attrs[CTRL_ATTR_FAMILY_ID]); res = genl_family_find_byid(id); err = -ENOENT; } if (info->attrs[CTRL_ATTR_FAMILY_NAME]) { char *name; name = nla_data(info->attrs[CTRL_ATTR_FAMILY_NAME]); res = genl_family_find_byname(name); #ifdef CONFIG_MODULES if (res == NULL) { genl_unlock(); up_read(&cb_lock); request_module("net-pf-%d-proto-%d-family-%s", PF_NETLINK, NETLINK_GENERIC, name); down_read(&cb_lock); genl_lock(); res = genl_family_find_byname(name); } #endif err = -ENOENT; } if (res == NULL) return err; if (!res->netnsok && !net_eq(genl_info_net(info), &init_net)) { /* family doesn't exist here */ return -ENOENT; } msg = ctrl_build_family_msg(res, info->snd_portid, info->snd_seq, CTRL_CMD_NEWFAMILY); if (IS_ERR(msg)) return PTR_ERR(msg); return genlmsg_reply(msg, info); } static int genl_ctrl_event(int event, const struct genl_family *family, const struct genl_multicast_group *grp, int grp_id) { struct sk_buff *msg; /* genl is still initialising */ if (!init_net.genl_sock) return 0; switch (event) { case CTRL_CMD_NEWFAMILY: case CTRL_CMD_DELFAMILY: WARN_ON(grp); msg = ctrl_build_family_msg(family, 0, 0, event); break; case CTRL_CMD_NEWMCAST_GRP: case CTRL_CMD_DELMCAST_GRP: BUG_ON(!grp); msg = ctrl_build_mcgrp_msg(family, grp, grp_id, 0, 0, event); break; default: return -EINVAL; } if (IS_ERR(msg)) return PTR_ERR(msg); if (!family->netnsok) { genlmsg_multicast_netns(&genl_ctrl, &init_net, msg, 0, 0, GFP_KERNEL); } else { rcu_read_lock(); genlmsg_multicast_allns(&genl_ctrl, msg, 0, 0, GFP_ATOMIC); rcu_read_unlock(); } return 0; } struct ctrl_dump_policy_ctx { struct netlink_policy_dump_state *state; const struct genl_family *rt; struct genl_op_iter *op_iter; u32 op; u16 fam_id; u8 dump_map:1, single_op:1; }; static const struct nla_policy ctrl_policy_policy[] = { [CTRL_ATTR_FAMILY_ID] = { .type = NLA_U16 }, [CTRL_ATTR_FAMILY_NAME] = { .type = NLA_NUL_STRING, .len = GENL_NAMSIZ - 1 }, [CTRL_ATTR_OP] = { .type = NLA_U32 }, }; static int ctrl_dumppolicy_start(struct netlink_callback *cb) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx; struct nlattr **tb = info->info.attrs; const struct genl_family *rt; struct genl_op_iter i; int err; BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); if (!tb[CTRL_ATTR_FAMILY_ID] && !tb[CTRL_ATTR_FAMILY_NAME]) return -EINVAL; if (tb[CTRL_ATTR_FAMILY_ID]) { ctx->fam_id = nla_get_u16(tb[CTRL_ATTR_FAMILY_ID]); } else { rt = genl_family_find_byname( nla_data(tb[CTRL_ATTR_FAMILY_NAME])); if (!rt) return -ENOENT; ctx->fam_id = rt->id; } rt = genl_family_find_byid(ctx->fam_id); if (!rt) return -ENOENT; ctx->rt = rt; if (tb[CTRL_ATTR_OP]) { struct genl_split_ops doit, dump; ctx->single_op = true; ctx->op = nla_get_u32(tb[CTRL_ATTR_OP]); err = genl_get_cmd_both(ctx->op, rt, &doit, &dump); if (err) { NL_SET_BAD_ATTR(cb->extack, tb[CTRL_ATTR_OP]); return err; } if (doit.policy) { err = netlink_policy_dump_add_policy(&ctx->state, doit.policy, doit.maxattr); if (err) goto err_free_state; } if (dump.policy) { err = netlink_policy_dump_add_policy(&ctx->state, dump.policy, dump.maxattr); if (err) goto err_free_state; } if (!ctx->state) return -ENODATA; ctx->dump_map = 1; return 0; } ctx->op_iter = kmalloc(sizeof(*ctx->op_iter), GFP_KERNEL); if (!ctx->op_iter) return -ENOMEM; genl_op_iter_init(rt, ctx->op_iter); ctx->dump_map = genl_op_iter_next(ctx->op_iter); for (genl_op_iter_init(rt, &i); genl_op_iter_next(&i); ) { if (i.doit.policy) { err = netlink_policy_dump_add_policy(&ctx->state, i.doit.policy, i.doit.maxattr); if (err) goto err_free_state; } if (i.dumpit.policy) { err = netlink_policy_dump_add_policy(&ctx->state, i.dumpit.policy, i.dumpit.maxattr); if (err) goto err_free_state; } } if (!ctx->state) { err = -ENODATA; goto err_free_op_iter; } return 0; err_free_state: netlink_policy_dump_free(ctx->state); err_free_op_iter: kfree(ctx->op_iter); return err; } static void *ctrl_dumppolicy_prep(struct sk_buff *skb, struct netlink_callback *cb) { struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx; void *hdr; hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &genl_ctrl, NLM_F_MULTI, CTRL_CMD_GETPOLICY); if (!hdr) return NULL; if (nla_put_u16(skb, CTRL_ATTR_FAMILY_ID, ctx->fam_id)) return NULL; return hdr; } static int ctrl_dumppolicy_put_op(struct sk_buff *skb, struct netlink_callback *cb, struct genl_split_ops *doit, struct genl_split_ops *dumpit) { struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx; struct nlattr *nest_pol, *nest_op; void *hdr; int idx; /* skip if we have nothing to show */ if (!doit->policy && !dumpit->policy) return 0; hdr = ctrl_dumppolicy_prep(skb, cb); if (!hdr) return -ENOBUFS; nest_pol = nla_nest_start(skb, CTRL_ATTR_OP_POLICY); if (!nest_pol) goto err; nest_op = nla_nest_start(skb, doit->cmd); if (!nest_op) goto err; if (doit->policy) { idx = netlink_policy_dump_get_policy_idx(ctx->state, doit->policy, doit->maxattr); if (nla_put_u32(skb, CTRL_ATTR_POLICY_DO, idx)) goto err; } if (dumpit->policy) { idx = netlink_policy_dump_get_policy_idx(ctx->state, dumpit->policy, dumpit->maxattr); if (nla_put_u32(skb, CTRL_ATTR_POLICY_DUMP, idx)) goto err; } nla_nest_end(skb, nest_op); nla_nest_end(skb, nest_pol); genlmsg_end(skb, hdr); return 0; err: genlmsg_cancel(skb, hdr); return -ENOBUFS; } static int ctrl_dumppolicy(struct sk_buff *skb, struct netlink_callback *cb) { struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx; void *hdr; if (ctx->dump_map) { if (ctx->single_op) { struct genl_split_ops doit, dumpit; if (WARN_ON(genl_get_cmd_both(ctx->op, ctx->rt, &doit, &dumpit))) return -ENOENT; if (ctrl_dumppolicy_put_op(skb, cb, &doit, &dumpit)) return skb->len; /* done with the per-op policy index list */ ctx->dump_map = 0; } while (ctx->dump_map) { if (ctrl_dumppolicy_put_op(skb, cb, &ctx->op_iter->doit, &ctx->op_iter->dumpit)) return skb->len; ctx->dump_map = genl_op_iter_next(ctx->op_iter); } } while (netlink_policy_dump_loop(ctx->state)) { struct nlattr *nest; hdr = ctrl_dumppolicy_prep(skb, cb); if (!hdr) goto nla_put_failure; nest = nla_nest_start(skb, CTRL_ATTR_POLICY); if (!nest) goto nla_put_failure; if (netlink_policy_dump_write(skb, ctx->state)) goto nla_put_failure; nla_nest_end(skb, nest); genlmsg_end(skb, hdr); } return skb->len; nla_put_failure: genlmsg_cancel(skb, hdr); return skb->len; } static int ctrl_dumppolicy_done(struct netlink_callback *cb) { struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx; kfree(ctx->op_iter); netlink_policy_dump_free(ctx->state); return 0; } static const struct genl_split_ops genl_ctrl_ops[] = { { .cmd = CTRL_CMD_GETFAMILY, .validate = GENL_DONT_VALIDATE_STRICT, .policy = ctrl_policy_family, .maxattr = ARRAY_SIZE(ctrl_policy_family) - 1, .doit = ctrl_getfamily, .flags = GENL_CMD_CAP_DO, }, { .cmd = CTRL_CMD_GETFAMILY, .validate = GENL_DONT_VALIDATE_DUMP, .policy = ctrl_policy_family, .maxattr = ARRAY_SIZE(ctrl_policy_family) - 1, .dumpit = ctrl_dumpfamily, .flags = GENL_CMD_CAP_DUMP, }, { .cmd = CTRL_CMD_GETPOLICY, .policy = ctrl_policy_policy, .maxattr = ARRAY_SIZE(ctrl_policy_policy) - 1, .start = ctrl_dumppolicy_start, .dumpit = ctrl_dumppolicy, .done = ctrl_dumppolicy_done, .flags = GENL_CMD_CAP_DUMP, }, }; static const struct genl_multicast_group genl_ctrl_groups[] = { { .name = "notify", }, }; static struct genl_family genl_ctrl __ro_after_init = { .module = THIS_MODULE, .split_ops = genl_ctrl_ops, .n_split_ops = ARRAY_SIZE(genl_ctrl_ops), .resv_start_op = CTRL_CMD_GETPOLICY + 1, .mcgrps = genl_ctrl_groups, .n_mcgrps = ARRAY_SIZE(genl_ctrl_groups), .id = GENL_ID_CTRL, .name = "nlctrl", .version = 0x2, .netnsok = true, }; static int genl_bind(struct net *net, int group) { const struct genl_family *family; unsigned int id; int ret = 0; down_read(&cb_lock); idr_for_each_entry(&genl_fam_idr, family, id) { const struct genl_multicast_group *grp; int i; if (family->n_mcgrps == 0) continue; i = group - family->mcgrp_offset; if (i < 0 || i >= family->n_mcgrps) continue; grp = &family->mcgrps[i]; if ((grp->flags & GENL_MCAST_CAP_NET_ADMIN) && !ns_capable(net->user_ns, CAP_NET_ADMIN)) ret = -EPERM; if ((grp->flags & GENL_MCAST_CAP_SYS_ADMIN) && !ns_capable(net->user_ns, CAP_SYS_ADMIN)) ret = -EPERM; if (family->bind) family->bind(i); break; } up_read(&cb_lock); return ret; } static void genl_unbind(struct net *net, int group) { const struct genl_family *family; unsigned int id; down_read(&cb_lock); idr_for_each_entry(&genl_fam_idr, family, id) { int i; if (family->n_mcgrps == 0) continue; i = group - family->mcgrp_offset; if (i < 0 || i >= family->n_mcgrps) continue; if (family->unbind) family->unbind(i); break; } up_read(&cb_lock); } static int __net_init genl_pernet_init(struct net *net) { struct netlink_kernel_cfg cfg = { .input = genl_rcv, .flags = NL_CFG_F_NONROOT_RECV, .bind = genl_bind, .unbind = genl_unbind, .release = genl_release, }; /* we'll bump the group number right afterwards */ net->genl_sock = netlink_kernel_create(net, NETLINK_GENERIC, &cfg); if (!net->genl_sock && net_eq(net, &init_net)) panic("GENL: Cannot initialize generic netlink\n"); if (!net->genl_sock) return -ENOMEM; return 0; } static void __net_exit genl_pernet_exit(struct net *net) { netlink_kernel_release(net->genl_sock); net->genl_sock = NULL; } static struct pernet_operations genl_pernet_ops = { .init = genl_pernet_init, .exit = genl_pernet_exit, }; static int __init genl_init(void) { int err; err = genl_register_family(&genl_ctrl); if (err < 0) goto problem; err = register_pernet_subsys(&genl_pernet_ops); if (err) goto problem; return 0; problem: panic("GENL: Cannot register controller: %d\n", err); } core_initcall(genl_init); static int genlmsg_mcast(struct sk_buff *skb, u32 portid, unsigned long group, gfp_t flags) { struct sk_buff *tmp; struct net *net, *prev = NULL; bool delivered = false; int err; for_each_net_rcu(net) { if (prev) { tmp = skb_clone(skb, flags); if (!tmp) { err = -ENOMEM; goto error; } err = nlmsg_multicast(prev->genl_sock, tmp, portid, group, flags); if (!err) delivered = true; else if (err != -ESRCH) goto error; } prev = net; } err = nlmsg_multicast(prev->genl_sock, skb, portid, group, flags); if (!err) delivered = true; else if (err != -ESRCH) return err; return delivered ? 0 : -ESRCH; error: kfree_skb(skb); return err; } int genlmsg_multicast_allns(const struct genl_family *family, struct sk_buff *skb, u32 portid, unsigned int group, gfp_t flags) { if (WARN_ON_ONCE(group >= family->n_mcgrps)) return -EINVAL; group = family->mcgrp_offset + group; return genlmsg_mcast(skb, portid, group, flags); } EXPORT_SYMBOL(genlmsg_multicast_allns); void genl_notify(const struct genl_family *family, struct sk_buff *skb, struct genl_info *info, u32 group, gfp_t flags) { struct net *net = genl_info_net(info); struct sock *sk = net->genl_sock; if (WARN_ON_ONCE(group >= family->n_mcgrps)) return; group = family->mcgrp_offset + group; nlmsg_notify(sk, skb, info->snd_portid, group, nlmsg_report(info->nlhdr), flags); } EXPORT_SYMBOL(genl_notify);
1 7 2 14 2 15 3 1 3 3 3 8 12 41 65 3 3 3 3 9 2 1 1 1 1 1 8 1 8 8 8 3 1 9 9 11 12 7 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_IP_TUNNELS_H #define __NET_IP_TUNNELS_H 1 #include <linux/if_tunnel.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/socket.h> #include <linux/types.h> #include <linux/u64_stats_sync.h> #include <linux/bitops.h> #include <net/dsfield.h> #include <net/gro_cells.h> #include <net/inet_ecn.h> #include <net/netns/generic.h> #include <net/rtnetlink.h> #include <net/lwtunnel.h> #include <net/dst_cache.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> #endif /* Keep error state on tunnel for 30 sec */ #define IPTUNNEL_ERR_TIMEO (30*HZ) /* Used to memset ip_tunnel padding. */ #define IP_TUNNEL_KEY_SIZE offsetofend(struct ip_tunnel_key, tp_dst) /* Used to memset ipv4 address padding. */ #define IP_TUNNEL_KEY_IPV4_PAD offsetofend(struct ip_tunnel_key, u.ipv4.dst) #define IP_TUNNEL_KEY_IPV4_PAD_LEN \ (sizeof_field(struct ip_tunnel_key, u) - \ sizeof_field(struct ip_tunnel_key, u.ipv4)) #define __ipt_flag_op(op, ...) \ op(__VA_ARGS__, __IP_TUNNEL_FLAG_NUM) #define IP_TUNNEL_DECLARE_FLAGS(...) \ __ipt_flag_op(DECLARE_BITMAP, __VA_ARGS__) #define ip_tunnel_flags_zero(...) __ipt_flag_op(bitmap_zero, __VA_ARGS__) #define ip_tunnel_flags_copy(...) __ipt_flag_op(bitmap_copy, __VA_ARGS__) #define ip_tunnel_flags_and(...) __ipt_flag_op(bitmap_and, __VA_ARGS__) #define ip_tunnel_flags_or(...) __ipt_flag_op(bitmap_or, __VA_ARGS__) #define ip_tunnel_flags_empty(...) \ __ipt_flag_op(bitmap_empty, __VA_ARGS__) #define ip_tunnel_flags_intersect(...) \ __ipt_flag_op(bitmap_intersects, __VA_ARGS__) #define ip_tunnel_flags_subset(...) \ __ipt_flag_op(bitmap_subset, __VA_ARGS__) struct ip_tunnel_key { __be64 tun_id; union { struct { __be32 src; __be32 dst; } ipv4; struct { struct in6_addr src; struct in6_addr dst; } ipv6; } u; IP_TUNNEL_DECLARE_FLAGS(tun_flags); __be32 label; /* Flow Label for IPv6 */ u32 nhid; u8 tos; /* TOS for IPv4, TC for IPv6 */ u8 ttl; /* TTL for IPv4, HL for IPv6 */ __be16 tp_src; __be16 tp_dst; __u8 flow_flags; }; struct ip_tunnel_encap { u16 type; u16 flags; __be16 sport; __be16 dport; }; /* Flags for ip_tunnel_info mode. */ #define IP_TUNNEL_INFO_TX 0x01 /* represents tx tunnel parameters */ #define IP_TUNNEL_INFO_IPV6 0x02 /* key contains IPv6 addresses */ #define IP_TUNNEL_INFO_BRIDGE 0x04 /* represents a bridged tunnel id */ /* Maximum tunnel options length. */ #define IP_TUNNEL_OPTS_MAX \ GENMASK((sizeof_field(struct ip_tunnel_info, \ options_len) * BITS_PER_BYTE) - 1, 0) #define ip_tunnel_info_opts(info) \ _Generic(info, \ const struct ip_tunnel_info * : ((const void *)((info) + 1)),\ struct ip_tunnel_info * : ((void *)((info) + 1))\ ) struct ip_tunnel_info { struct ip_tunnel_key key; struct ip_tunnel_encap encap; #ifdef CONFIG_DST_CACHE struct dst_cache dst_cache; #endif u8 options_len; u8 mode; }; /* 6rd prefix/relay information */ #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel_6rd_parm { struct in6_addr prefix; __be32 relay_prefix; u16 prefixlen; u16 relay_prefixlen; }; #endif struct ip_tunnel_prl_entry { struct ip_tunnel_prl_entry __rcu *next; __be32 addr; u16 flags; struct rcu_head rcu_head; }; struct metadata_dst; /* Kernel-side variant of ip_tunnel_parm */ struct ip_tunnel_parm_kern { char name[IFNAMSIZ]; IP_TUNNEL_DECLARE_FLAGS(i_flags); IP_TUNNEL_DECLARE_FLAGS(o_flags); __be32 i_key; __be32 o_key; int link; struct iphdr iph; }; struct ip_tunnel { struct ip_tunnel __rcu *next; struct hlist_node hash_node; struct net_device *dev; netdevice_tracker dev_tracker; struct net *net; /* netns for packet i/o */ unsigned long err_time; /* Time when the last ICMP error * arrived */ int err_count; /* Number of arrived ICMP errors */ /* These four fields used only by GRE */ u32 i_seqno; /* The last seen seqno */ atomic_t o_seqno; /* The last output seqno */ int tun_hlen; /* Precalculated header length */ /* These four fields used only by ERSPAN */ u32 index; /* ERSPAN type II index */ u8 erspan_ver; /* ERSPAN version */ u8 dir; /* ERSPAN direction */ u16 hwid; /* ERSPAN hardware ID */ struct dst_cache dst_cache; struct ip_tunnel_parm_kern parms; int mlink; int encap_hlen; /* Encap header length (FOU,GUE) */ int hlen; /* tun_hlen + encap_hlen */ struct ip_tunnel_encap encap; /* for SIT */ #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel_6rd_parm ip6rd; #endif struct ip_tunnel_prl_entry __rcu *prl; /* potential router list */ unsigned int prl_count; /* # of entries in PRL */ unsigned int ip_tnl_net_id; struct gro_cells gro_cells; __u32 fwmark; bool collect_md; bool ignore_df; }; struct tnl_ptk_info { IP_TUNNEL_DECLARE_FLAGS(flags); __be16 proto; __be32 key; __be32 seq; int hdr_len; }; #define PACKET_RCVD 0 #define PACKET_REJECT 1 #define PACKET_NEXT 2 #define IP_TNL_HASH_BITS 7 #define IP_TNL_HASH_SIZE (1 << IP_TNL_HASH_BITS) struct ip_tunnel_net { struct net_device *fb_tunnel_dev; struct rtnl_link_ops *rtnl_link_ops; struct hlist_head tunnels[IP_TNL_HASH_SIZE]; struct ip_tunnel __rcu *collect_md_tun; int type; }; static inline void ip_tunnel_set_options_present(unsigned long *flags) { IP_TUNNEL_DECLARE_FLAGS(present) = { }; __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, present); __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, present); __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, present); __set_bit(IP_TUNNEL_GTP_OPT_BIT, present); __set_bit(IP_TUNNEL_PFCP_OPT_BIT, present); ip_tunnel_flags_or(flags, flags, present); } static inline void ip_tunnel_clear_options_present(unsigned long *flags) { IP_TUNNEL_DECLARE_FLAGS(present) = { }; __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, present); __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, present); __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, present); __set_bit(IP_TUNNEL_GTP_OPT_BIT, present); __set_bit(IP_TUNNEL_PFCP_OPT_BIT, present); __ipt_flag_op(bitmap_andnot, flags, flags, present); } static inline bool ip_tunnel_is_options_present(const unsigned long *flags) { IP_TUNNEL_DECLARE_FLAGS(present) = { }; __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, present); __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, present); __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, present); __set_bit(IP_TUNNEL_GTP_OPT_BIT, present); __set_bit(IP_TUNNEL_PFCP_OPT_BIT, present); return ip_tunnel_flags_intersect(flags, present); } static inline bool ip_tunnel_flags_is_be16_compat(const unsigned long *flags) { IP_TUNNEL_DECLARE_FLAGS(supp) = { }; bitmap_set(supp, 0, BITS_PER_TYPE(__be16)); __set_bit(IP_TUNNEL_VTI_BIT, supp); return ip_tunnel_flags_subset(flags, supp); } static inline void ip_tunnel_flags_from_be16(unsigned long *dst, __be16 flags) { ip_tunnel_flags_zero(dst); bitmap_write(dst, be16_to_cpu(flags), 0, BITS_PER_TYPE(__be16)); __assign_bit(IP_TUNNEL_VTI_BIT, dst, flags & VTI_ISVTI); } static inline __be16 ip_tunnel_flags_to_be16(const unsigned long *flags) { __be16 ret; ret = cpu_to_be16(bitmap_read(flags, 0, BITS_PER_TYPE(__be16))); if (test_bit(IP_TUNNEL_VTI_BIT, flags)) ret |= VTI_ISVTI; return ret; } static inline void ip_tunnel_key_init(struct ip_tunnel_key *key, __be32 saddr, __be32 daddr, u8 tos, u8 ttl, __be32 label, __be16 tp_src, __be16 tp_dst, __be64 tun_id, const unsigned long *tun_flags) { key->tun_id = tun_id; key->u.ipv4.src = saddr; key->u.ipv4.dst = daddr; memset((unsigned char *)key + IP_TUNNEL_KEY_IPV4_PAD, 0, IP_TUNNEL_KEY_IPV4_PAD_LEN); key->tos = tos; key->ttl = ttl; key->label = label; ip_tunnel_flags_copy(key->tun_flags, tun_flags); /* For the tunnel types on the top of IPsec, the tp_src and tp_dst of * the upper tunnel are used. * E.g: GRE over IPSEC, the tp_src and tp_port are zero. */ key->tp_src = tp_src; key->tp_dst = tp_dst; /* Clear struct padding. */ if (sizeof(*key) != IP_TUNNEL_KEY_SIZE) memset((unsigned char *)key + IP_TUNNEL_KEY_SIZE, 0, sizeof(*key) - IP_TUNNEL_KEY_SIZE); } static inline bool ip_tunnel_dst_cache_usable(const struct sk_buff *skb, const struct ip_tunnel_info *info) { if (skb->mark) return false; return !info || !test_bit(IP_TUNNEL_NOCACHE_BIT, info->key.tun_flags); } static inline unsigned short ip_tunnel_info_af(const struct ip_tunnel_info *tun_info) { return tun_info->mode & IP_TUNNEL_INFO_IPV6 ? AF_INET6 : AF_INET; } static inline __be64 key32_to_tunnel_id(__be32 key) { #ifdef __BIG_ENDIAN return (__force __be64)key; #else return (__force __be64)((__force u64)key << 32); #endif } /* Returns the least-significant 32 bits of a __be64. */ static inline __be32 tunnel_id_to_key32(__be64 tun_id) { #ifdef __BIG_ENDIAN return (__force __be32)tun_id; #else return (__force __be32)((__force u64)tun_id >> 32); #endif } #ifdef CONFIG_INET static inline void ip_tunnel_init_flow(struct flowi4 *fl4, int proto, __be32 daddr, __be32 saddr, __be32 key, __u8 tos, struct net *net, int oif, __u32 mark, __u32 tun_inner_hash, __u8 flow_flags) { memset(fl4, 0, sizeof(*fl4)); if (oif) { fl4->flowi4_l3mdev = l3mdev_master_upper_ifindex_by_index_rcu(net, oif); /* Legacy VRF/l3mdev use case */ fl4->flowi4_oif = fl4->flowi4_l3mdev ? 0 : oif; } fl4->daddr = daddr; fl4->saddr = saddr; fl4->flowi4_tos = tos; fl4->flowi4_proto = proto; fl4->fl4_gre_key = key; fl4->flowi4_mark = mark; fl4->flowi4_multipath_hash = tun_inner_hash; fl4->flowi4_flags = flow_flags; } int ip_tunnel_init(struct net_device *dev); void ip_tunnel_uninit(struct net_device *dev); void ip_tunnel_dellink(struct net_device *dev, struct list_head *head); struct net *ip_tunnel_get_link_net(const struct net_device *dev); int ip_tunnel_get_iflink(const struct net_device *dev); int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, struct rtnl_link_ops *ops, char *devname); void ip_tunnel_delete_nets(struct list_head *list_net, unsigned int id, struct rtnl_link_ops *ops, struct list_head *dev_to_kill); void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, const u8 protocol); void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const u8 proto, int tunnel_hlen); int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd); bool ip_tunnel_parm_from_user(struct ip_tunnel_parm_kern *kp, const void __user *data); bool ip_tunnel_parm_to_user(void __user *data, struct ip_tunnel_parm_kern *kp); int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd); int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict); int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu); struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, int link, const unsigned long *flags, __be32 remote, __be32 local, __be32 key); void ip_tunnel_md_udp_encap(struct sk_buff *skb, struct ip_tunnel_info *info); int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, bool log_ecn_error); int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm_kern *p, __u32 fwmark); int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm_kern *p, __u32 fwmark); void ip_tunnel_setup(struct net_device *dev, unsigned int net_id); bool ip_tunnel_netlink_encap_parms(struct nlattr *data[], struct ip_tunnel_encap *encap); void ip_tunnel_netlink_parms(struct nlattr *data[], struct ip_tunnel_parm_kern *parms); extern const struct header_ops ip_tunnel_header_ops; __be16 ip_tunnel_parse_protocol(const struct sk_buff *skb); struct ip_tunnel_encap_ops { size_t (*encap_hlen)(struct ip_tunnel_encap *e); int (*build_header)(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, struct flowi4 *fl4); int (*err_handler)(struct sk_buff *skb, u32 info); }; #define MAX_IPTUN_ENCAP_OPS 8 extern const struct ip_tunnel_encap_ops __rcu * iptun_encaps[MAX_IPTUN_ENCAP_OPS]; int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *op, unsigned int num); int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *op, unsigned int num); int ip_tunnel_encap_setup(struct ip_tunnel *t, struct ip_tunnel_encap *ipencap); static inline bool pskb_inet_may_pull(struct sk_buff *skb) { int nhlen; switch (skb->protocol) { #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): nhlen = sizeof(struct ipv6hdr); break; #endif case htons(ETH_P_IP): nhlen = sizeof(struct iphdr); break; default: nhlen = 0; } return pskb_network_may_pull(skb, nhlen); } /* Variant of pskb_inet_may_pull(). */ static inline bool skb_vlan_inet_prepare(struct sk_buff *skb, bool inner_proto_inherit) { int nhlen = 0, maclen = inner_proto_inherit ? 0 : ETH_HLEN; __be16 type = skb->protocol; /* Essentially this is skb_protocol(skb, true) * And we get MAC len. */ if (eth_type_vlan(type)) type = __vlan_get_protocol(skb, type, &maclen); switch (type) { #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): nhlen = sizeof(struct ipv6hdr); break; #endif case htons(ETH_P_IP): nhlen = sizeof(struct iphdr); break; } /* For ETH_P_IPV6/ETH_P_IP we make sure to pull * a base network header in skb->head. */ if (!pskb_may_pull(skb, maclen + nhlen)) return false; skb_set_network_header(skb, maclen); return true; } static inline int ip_encap_hlen(struct ip_tunnel_encap *e) { const struct ip_tunnel_encap_ops *ops; int hlen = -EINVAL; if (e->type == TUNNEL_ENCAP_NONE) return 0; if (e->type >= MAX_IPTUN_ENCAP_OPS) return -EINVAL; rcu_read_lock(); ops = rcu_dereference(iptun_encaps[e->type]); if (likely(ops && ops->encap_hlen)) hlen = ops->encap_hlen(e); rcu_read_unlock(); return hlen; } static inline int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, struct flowi4 *fl4) { const struct ip_tunnel_encap_ops *ops; int ret = -EINVAL; if (e->type == TUNNEL_ENCAP_NONE) return 0; if (e->type >= MAX_IPTUN_ENCAP_OPS) return -EINVAL; rcu_read_lock(); ops = rcu_dereference(iptun_encaps[e->type]); if (likely(ops && ops->build_header)) ret = ops->build_header(skb, e, protocol, fl4); rcu_read_unlock(); return ret; } /* Extract dsfield from inner protocol */ static inline u8 ip_tunnel_get_dsfield(const struct iphdr *iph, const struct sk_buff *skb) { __be16 payload_protocol = skb_protocol(skb, true); if (payload_protocol == htons(ETH_P_IP)) return iph->tos; else if (payload_protocol == htons(ETH_P_IPV6)) return ipv6_get_dsfield((const struct ipv6hdr *)iph); else return 0; } static inline __be32 ip_tunnel_get_flowlabel(const struct iphdr *iph, const struct sk_buff *skb) { __be16 payload_protocol = skb_protocol(skb, true); if (payload_protocol == htons(ETH_P_IPV6)) return ip6_flowlabel((const struct ipv6hdr *)iph); else return 0; } static inline u8 ip_tunnel_get_ttl(const struct iphdr *iph, const struct sk_buff *skb) { __be16 payload_protocol = skb_protocol(skb, true); if (payload_protocol == htons(ETH_P_IP)) return iph->ttl; else if (payload_protocol == htons(ETH_P_IPV6)) return ((const struct ipv6hdr *)iph)->hop_limit; else return 0; } /* Propogate ECN bits out */ static inline u8 ip_tunnel_ecn_encap(u8 tos, const struct iphdr *iph, const struct sk_buff *skb) { u8 inner = ip_tunnel_get_dsfield(iph, skb); return INET_ECN_encapsulate(tos, inner); } int __iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto, bool raw_proto, bool xnet); static inline int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto, bool xnet) { return __iptunnel_pull_header(skb, hdr_len, inner_proto, false, xnet); } void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, u8 proto, u8 tos, u8 ttl, __be16 df, bool xnet); struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, gfp_t flags); int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst, int headroom, bool reply); int iptunnel_handle_offloads(struct sk_buff *skb, int gso_type_mask); static inline int iptunnel_pull_offloads(struct sk_buff *skb) { if (skb_is_gso(skb)) { int err; err = skb_unclone(skb, GFP_ATOMIC); if (unlikely(err)) return err; skb_shinfo(skb)->gso_type &= ~(NETIF_F_GSO_ENCAP_ALL >> NETIF_F_GSO_SHIFT); } skb->encapsulation = 0; return 0; } static inline void iptunnel_xmit_stats(struct net_device *dev, int pkt_len) { if (pkt_len > 0) { struct pcpu_sw_netstats *tstats = get_cpu_ptr(dev->tstats); u64_stats_update_begin(&tstats->syncp); u64_stats_add(&tstats->tx_bytes, pkt_len); u64_stats_inc(&tstats->tx_packets); u64_stats_update_end(&tstats->syncp); put_cpu_ptr(tstats); return; } if (pkt_len < 0) { DEV_STATS_INC(dev, tx_errors); DEV_STATS_INC(dev, tx_aborted_errors); } else { DEV_STATS_INC(dev, tx_dropped); } } static inline void ip_tunnel_info_opts_get(void *to, const struct ip_tunnel_info *info) { memcpy(to, info + 1, info->options_len); } static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info, const void *from, int len, const unsigned long *flags) { info->options_len = len; if (len > 0) { memcpy(ip_tunnel_info_opts(info), from, len); ip_tunnel_flags_or(info->key.tun_flags, info->key.tun_flags, flags); } } static inline struct ip_tunnel_info *lwt_tun_info(struct lwtunnel_state *lwtstate) { return (struct ip_tunnel_info *)lwtstate->data; } DECLARE_STATIC_KEY_FALSE(ip_tunnel_metadata_cnt); /* Returns > 0 if metadata should be collected */ static inline int ip_tunnel_collect_metadata(void) { return static_branch_unlikely(&ip_tunnel_metadata_cnt); } void __init ip_tunnel_core_init(void); void ip_tunnel_need_metadata(void); void ip_tunnel_unneed_metadata(void); #else /* CONFIG_INET */ static inline struct ip_tunnel_info *lwt_tun_info(struct lwtunnel_state *lwtstate) { return NULL; } static inline void ip_tunnel_need_metadata(void) { } static inline void ip_tunnel_unneed_metadata(void) { } static inline void ip_tunnel_info_opts_get(void *to, const struct ip_tunnel_info *info) { } static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info, const void *from, int len, const unsigned long *flags) { info->options_len = 0; } #endif /* CONFIG_INET */ #endif /* __NET_IP_TUNNELS_H */
23 1 61 13 5 5 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 /* * Compatibility functions which bloat the callers too much to make inline. * All of the callers of these functions should be converted to use folios * eventually. */ #include <linux/migrate.h> #include <linux/pagemap.h> #include <linux/rmap.h> #include <linux/swap.h> #include "internal.h" struct address_space *page_mapping(struct page *page) { return folio_mapping(page_folio(page)); } EXPORT_SYMBOL(page_mapping); void unlock_page(struct page *page) { return folio_unlock(page_folio(page)); } EXPORT_SYMBOL(unlock_page); void end_page_writeback(struct page *page) { return folio_end_writeback(page_folio(page)); } EXPORT_SYMBOL(end_page_writeback); void wait_on_page_writeback(struct page *page) { return folio_wait_writeback(page_folio(page)); } EXPORT_SYMBOL_GPL(wait_on_page_writeback); void wait_for_stable_page(struct page *page) { return folio_wait_stable(page_folio(page)); } EXPORT_SYMBOL_GPL(wait_for_stable_page); void mark_page_accessed(struct page *page) { folio_mark_accessed(page_folio(page)); } EXPORT_SYMBOL(mark_page_accessed); void set_page_writeback(struct page *page) { folio_start_writeback(page_folio(page)); } EXPORT_SYMBOL(set_page_writeback); bool set_page_dirty(struct page *page) { return folio_mark_dirty(page_folio(page)); } EXPORT_SYMBOL(set_page_dirty); bool clear_page_dirty_for_io(struct page *page) { return folio_clear_dirty_for_io(page_folio(page)); } EXPORT_SYMBOL(clear_page_dirty_for_io); bool redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) { return folio_redirty_for_writepage(wbc, page_folio(page)); } EXPORT_SYMBOL(redirty_page_for_writepage); int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t index, gfp_t gfp) { return filemap_add_folio(mapping, page_folio(page), index, gfp); } EXPORT_SYMBOL(add_to_page_cache_lru); noinline struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, fgf_t fgp_flags, gfp_t gfp) { struct folio *folio; folio = __filemap_get_folio(mapping, index, fgp_flags, gfp); if (IS_ERR(folio)) return NULL; return folio_file_page(folio, index); } EXPORT_SYMBOL(pagecache_get_page); struct page *grab_cache_page_write_begin(struct address_space *mapping, pgoff_t index) { return pagecache_get_page(mapping, index, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); } EXPORT_SYMBOL(grab_cache_page_write_begin); bool isolate_lru_page(struct page *page) { if (WARN_RATELIMIT(PageTail(page), "trying to isolate tail page")) return false; return folio_isolate_lru((struct folio *)page); } void putback_lru_page(struct page *page) { folio_putback_lru(page_folio(page)); }
9 9 9 9 9 9 9 35 26 23 26 2 2 2 26 32 33 2 2 2 2 13 13 13 33 32 33 32 21 21 18 18 19 19 18 3 14 27 25 25 10 10 8 2 8 8 8 27 20 20 19 13 13 13 20 20 20 1 1 18 19 19 19 18 19 19 16 19 3 19 2 20 18 20 20 20 19 3 3 19 19 19 19 19 18 19 19 18 18 19 2 19 2 19 5 19 17 19 2 19 2 18 6 19 18 19 11 11 11 3 11 13 12 20 20 20 20 20 20 20 20 20 20 12 3 13 13 12 14 14 20 4 4 20 15 1 1 21 1 21 16 15 15 15 15 15 14 16 7 2 5 17 16 16 16 7 17 12 12 10 10 5 10 13 17 17 17 16 15 1 16 17 13 13 12 12 13 13 1 18 8 7 17 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 // SPDX-License-Identifier: GPL-2.0 /* * This file contains the procedures for the handling of select and poll * * Created for Linux based loosely upon Mathius Lattner's minix * patches by Peter MacDonald. Heavily edited by Linus. * * 4 February 1994 * COFF/ELF binary emulation. If the process has the STICKY_TIMEOUTS * flag set in its personality we do *not* modify the given timeout * parameter to reflect time remaining. * * 24 January 2000 * Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation * of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian). */ #include <linux/compat.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/sched/rt.h> #include <linux/syscalls.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/personality.h> /* for STICKY_TIMEOUTS */ #include <linux/file.h> #include <linux/fdtable.h> #include <linux/fs.h> #include <linux/rcupdate.h> #include <linux/hrtimer.h> #include <linux/freezer.h> #include <net/busy_poll.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> /* * Estimate expected accuracy in ns from a timeval. * * After quite a bit of churning around, we've settled on * a simple thing of taking 0.1% of the timeout as the * slack, with a cap of 100 msec. * "nice" tasks get a 0.5% slack instead. * * Consider this comment an open invitation to come up with even * better solutions.. */ #define MAX_SLACK (100 * NSEC_PER_MSEC) static long __estimate_accuracy(struct timespec64 *tv) { long slack; int divfactor = 1000; if (tv->tv_sec < 0) return 0; if (task_nice(current) > 0) divfactor = divfactor / 5; if (tv->tv_sec > MAX_SLACK / (NSEC_PER_SEC/divfactor)) return MAX_SLACK; slack = tv->tv_nsec / divfactor; slack += tv->tv_sec * (NSEC_PER_SEC/divfactor); if (slack > MAX_SLACK) return MAX_SLACK; return slack; } u64 select_estimate_accuracy(struct timespec64 *tv) { u64 ret; struct timespec64 now; /* * Realtime tasks get a slack of 0 for obvious reasons. */ if (rt_task(current)) return 0; ktime_get_ts64(&now); now = timespec64_sub(*tv, now); ret = __estimate_accuracy(&now); if (ret < current->timer_slack_ns) return current->timer_slack_ns; return ret; } struct poll_table_page { struct poll_table_page * next; struct poll_table_entry * entry; struct poll_table_entry entries[]; }; #define POLL_TABLE_FULL(table) \ ((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table)) /* * Ok, Peter made a complicated, but straightforward multiple_wait() function. * I have rewritten this, taking some shortcuts: This code may not be easy to * follow, but it should be free of race-conditions, and it's practical. If you * understand what I'm doing here, then you understand how the linux * sleep/wakeup mechanism works. * * Two very simple procedures, poll_wait() and poll_freewait() make all the * work. poll_wait() is an inline-function defined in <linux/poll.h>, * as all select/poll functions have to call it to add an entry to the * poll table. */ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p); void poll_initwait(struct poll_wqueues *pwq) { init_poll_funcptr(&pwq->pt, __pollwait); pwq->polling_task = current; pwq->triggered = 0; pwq->error = 0; pwq->table = NULL; pwq->inline_index = 0; } EXPORT_SYMBOL(poll_initwait); static void free_poll_entry(struct poll_table_entry *entry) { remove_wait_queue(entry->wait_address, &entry->wait); fput(entry->filp); } void poll_freewait(struct poll_wqueues *pwq) { struct poll_table_page * p = pwq->table; int i; for (i = 0; i < pwq->inline_index; i++) free_poll_entry(pwq->inline_entries + i); while (p) { struct poll_table_entry * entry; struct poll_table_page *old; entry = p->entry; do { entry--; free_poll_entry(entry); } while (entry > p->entries); old = p; p = p->next; free_page((unsigned long) old); } } EXPORT_SYMBOL(poll_freewait); static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p) { struct poll_table_page *table = p->table; if (p->inline_index < N_INLINE_POLL_ENTRIES) return p->inline_entries + p->inline_index++; if (!table || POLL_TABLE_FULL(table)) { struct poll_table_page *new_table; new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL); if (!new_table) { p->error = -ENOMEM; return NULL; } new_table->entry = new_table->entries; new_table->next = table; p->table = new_table; table = new_table; } return table->entry++; } static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct poll_wqueues *pwq = wait->private; DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task); /* * Although this function is called under waitqueue lock, LOCK * doesn't imply write barrier and the users expect write * barrier semantics on wakeup functions. The following * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up() * and is paired with smp_store_mb() in poll_schedule_timeout. */ smp_wmb(); pwq->triggered = 1; /* * Perform the default wake up operation using a dummy * waitqueue. * * TODO: This is hacky but there currently is no interface to * pass in @sync. @sync is scheduled to be removed and once * that happens, wake_up_process() can be used directly. */ return default_wake_function(&dummy_wait, mode, sync, key); } static int pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct poll_table_entry *entry; entry = container_of(wait, struct poll_table_entry, wait); if (key && !(key_to_poll(key) & entry->key)) return 0; return __pollwake(wait, mode, sync, key); } /* Add a new entry */ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p) { struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt); struct poll_table_entry *entry = poll_get_entry(pwq); if (!entry) return; entry->filp = get_file(filp); entry->wait_address = wait_address; entry->key = p->_key; init_waitqueue_func_entry(&entry->wait, pollwake); entry->wait.private = pwq; add_wait_queue(wait_address, &entry->wait); } static int poll_schedule_timeout(struct poll_wqueues *pwq, int state, ktime_t *expires, unsigned long slack) { int rc = -EINTR; set_current_state(state); if (!pwq->triggered) rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS); __set_current_state(TASK_RUNNING); /* * Prepare for the next iteration. * * The following smp_store_mb() serves two purposes. First, it's * the counterpart rmb of the wmb in pollwake() such that data * written before wake up is always visible after wake up. * Second, the full barrier guarantees that triggered clearing * doesn't pass event check of the next iteration. Note that * this problem doesn't exist for the first iteration as * add_wait_queue() has full barrier semantics. */ smp_store_mb(pwq->triggered, 0); return rc; } /** * poll_select_set_timeout - helper function to setup the timeout value * @to: pointer to timespec64 variable for the final timeout * @sec: seconds (from user space) * @nsec: nanoseconds (from user space) * * Note, we do not use a timespec for the user space value here, That * way we can use the function for timeval and compat interfaces as well. * * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0. */ int poll_select_set_timeout(struct timespec64 *to, time64_t sec, long nsec) { struct timespec64 ts = {.tv_sec = sec, .tv_nsec = nsec}; if (!timespec64_valid(&ts)) return -EINVAL; /* Optimize for the zero timeout value here */ if (!sec && !nsec) { to->tv_sec = to->tv_nsec = 0; } else { ktime_get_ts64(to); *to = timespec64_add_safe(*to, ts); } return 0; } enum poll_time_type { PT_TIMEVAL = 0, PT_OLD_TIMEVAL = 1, PT_TIMESPEC = 2, PT_OLD_TIMESPEC = 3, }; static int poll_select_finish(struct timespec64 *end_time, void __user *p, enum poll_time_type pt_type, int ret) { struct timespec64 rts; restore_saved_sigmask_unless(ret == -ERESTARTNOHAND); if (!p) return ret; if (current->personality & STICKY_TIMEOUTS) goto sticky; /* No update for zero timeout */ if (!end_time->tv_sec && !end_time->tv_nsec) return ret; ktime_get_ts64(&rts); rts = timespec64_sub(*end_time, rts); if (rts.tv_sec < 0) rts.tv_sec = rts.tv_nsec = 0; switch (pt_type) { case PT_TIMEVAL: { struct __kernel_old_timeval rtv; if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec)) memset(&rtv, 0, sizeof(rtv)); rtv.tv_sec = rts.tv_sec; rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC; if (!copy_to_user(p, &rtv, sizeof(rtv))) return ret; } break; case PT_OLD_TIMEVAL: { struct old_timeval32 rtv; rtv.tv_sec = rts.tv_sec; rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC; if (!copy_to_user(p, &rtv, sizeof(rtv))) return ret; } break; case PT_TIMESPEC: if (!put_timespec64(&rts, p)) return ret; break; case PT_OLD_TIMESPEC: if (!put_old_timespec32(&rts, p)) return ret; break; default: BUG(); } /* * If an application puts its timeval in read-only memory, we * don't want the Linux-specific update to the timeval to * cause a fault after the select has completed * successfully. However, because we're not updating the * timeval, we can't restart the system call. */ sticky: if (ret == -ERESTARTNOHAND) ret = -EINTR; return ret; } /* * Scalable version of the fd_set. */ typedef struct { unsigned long *in, *out, *ex; unsigned long *res_in, *res_out, *res_ex; } fd_set_bits; /* * How many longwords for "nr" bits? */ #define FDS_BITPERLONG (8*sizeof(long)) #define FDS_LONGS(nr) (((nr)+FDS_BITPERLONG-1)/FDS_BITPERLONG) #define FDS_BYTES(nr) (FDS_LONGS(nr)*sizeof(long)) /* * Use "unsigned long" accesses to let user-mode fd_set's be long-aligned. */ static inline int get_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset) { nr = FDS_BYTES(nr); if (ufdset) return copy_from_user(fdset, ufdset, nr) ? -EFAULT : 0; memset(fdset, 0, nr); return 0; } static inline unsigned long __must_check set_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset) { if (ufdset) return __copy_to_user(ufdset, fdset, FDS_BYTES(nr)); return 0; } static inline void zero_fd_set(unsigned long nr, unsigned long *fdset) { memset(fdset, 0, FDS_BYTES(nr)); } #define FDS_IN(fds, n) (fds->in + n) #define FDS_OUT(fds, n) (fds->out + n) #define FDS_EX(fds, n) (fds->ex + n) #define BITS(fds, n) (*FDS_IN(fds, n)|*FDS_OUT(fds, n)|*FDS_EX(fds, n)) static int max_select_fd(unsigned long n, fd_set_bits *fds) { unsigned long *open_fds; unsigned long set; int max; struct fdtable *fdt; /* handle last in-complete long-word first */ set = ~(~0UL << (n & (BITS_PER_LONG-1))); n /= BITS_PER_LONG; fdt = files_fdtable(current->files); open_fds = fdt->open_fds + n; max = 0; if (set) { set &= BITS(fds, n); if (set) { if (!(set & ~*open_fds)) goto get_max; return -EBADF; } } while (n) { open_fds--; n--; set = BITS(fds, n); if (!set) continue; if (set & ~*open_fds) return -EBADF; if (max) continue; get_max: do { max++; set >>= 1; } while (set); max += n * BITS_PER_LONG; } return max; } #define POLLIN_SET (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN | EPOLLHUP | EPOLLERR |\ EPOLLNVAL) #define POLLOUT_SET (EPOLLWRBAND | EPOLLWRNORM | EPOLLOUT | EPOLLERR |\ EPOLLNVAL) #define POLLEX_SET (EPOLLPRI | EPOLLNVAL) static inline void wait_key_set(poll_table *wait, unsigned long in, unsigned long out, unsigned long bit, __poll_t ll_flag) { wait->_key = POLLEX_SET | ll_flag; if (in & bit) wait->_key |= POLLIN_SET; if (out & bit) wait->_key |= POLLOUT_SET; } static noinline_for_stack int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) { ktime_t expire, *to = NULL; struct poll_wqueues table; poll_table *wait; int retval, i, timed_out = 0; u64 slack = 0; __poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; unsigned long busy_start = 0; rcu_read_lock(); retval = max_select_fd(n, fds); rcu_read_unlock(); if (retval < 0) return retval; n = retval; poll_initwait(&table); wait = &table.pt; if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { wait->_qproc = NULL; timed_out = 1; } if (end_time && !timed_out) slack = select_estimate_accuracy(end_time); retval = 0; for (;;) { unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp; bool can_busy_loop = false; inp = fds->in; outp = fds->out; exp = fds->ex; rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex; for (i = 0; i < n; ++rinp, ++routp, ++rexp) { unsigned long in, out, ex, all_bits, bit = 1, j; unsigned long res_in = 0, res_out = 0, res_ex = 0; __poll_t mask; in = *inp++; out = *outp++; ex = *exp++; all_bits = in | out | ex; if (all_bits == 0) { i += BITS_PER_LONG; continue; } for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) { struct fd f; if (i >= n) break; if (!(bit & all_bits)) continue; mask = EPOLLNVAL; f = fdget(i); if (f.file) { wait_key_set(wait, in, out, bit, busy_flag); mask = vfs_poll(f.file, wait); fdput(f); } if ((mask & POLLIN_SET) && (in & bit)) { res_in |= bit; retval++; wait->_qproc = NULL; } if ((mask & POLLOUT_SET) && (out & bit)) { res_out |= bit; retval++; wait->_qproc = NULL; } if ((mask & POLLEX_SET) && (ex & bit)) { res_ex |= bit; retval++; wait->_qproc = NULL; } /* got something, stop busy polling */ if (retval) { can_busy_loop = false; busy_flag = 0; /* * only remember a returned * POLL_BUSY_LOOP if we asked for it */ } else if (busy_flag & mask) can_busy_loop = true; } if (res_in) *rinp = res_in; if (res_out) *routp = res_out; if (res_ex) *rexp = res_ex; cond_resched(); } wait->_qproc = NULL; if (retval || timed_out || signal_pending(current)) break; if (table.error) { retval = table.error; break; } /* only if found POLL_BUSY_LOOP sockets && not out of time */ if (can_busy_loop && !need_resched()) { if (!busy_start) { busy_start = busy_loop_current_time(); continue; } if (!busy_loop_timeout(busy_start)) continue; } busy_flag = 0; /* * If this is the first loop and we have a timeout * given, then we convert to ktime_t and set the to * pointer to the expiry value. */ if (end_time && !to) { expire = timespec64_to_ktime(*end_time); to = &expire; } if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE, to, slack)) timed_out = 1; } poll_freewait(&table); return retval; } /* * We can actually return ERESTARTSYS instead of EINTR, but I'd * like to be certain this leads to no problems. So I return * EINTR just for safety. * * Update: ERESTARTSYS breaks at least the xview clock binary, so * I'm trying ERESTARTNOHAND which restart only when you want to. */ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timespec64 *end_time) { fd_set_bits fds; void *bits; int ret, max_fds; size_t size, alloc_size; struct fdtable *fdt; /* Allocate small arguments on the stack to save memory and be faster */ long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; ret = -EINVAL; if (n < 0) goto out_nofds; /* max_fds can increase, so grab it once to avoid race */ rcu_read_lock(); fdt = files_fdtable(current->files); max_fds = fdt->max_fds; rcu_read_unlock(); if (n > max_fds) n = max_fds; /* * We need 6 bitmaps (in/out/ex for both incoming and outgoing), * since we used fdset we need to allocate memory in units of * long-words. */ size = FDS_BYTES(n); bits = stack_fds; if (size > sizeof(stack_fds) / 6) { /* Not enough space in on-stack array; must use kmalloc */ ret = -ENOMEM; if (size > (SIZE_MAX / 6)) goto out_nofds; alloc_size = 6 * size; bits = kvmalloc(alloc_size, GFP_KERNEL); if (!bits) goto out_nofds; } fds.in = bits; fds.out = bits + size; fds.ex = bits + 2*size; fds.res_in = bits + 3*size; fds.res_out = bits + 4*size; fds.res_ex = bits + 5*size; if ((ret = get_fd_set(n, inp, fds.in)) || (ret = get_fd_set(n, outp, fds.out)) || (ret = get_fd_set(n, exp, fds.ex))) goto out; zero_fd_set(n, fds.res_in); zero_fd_set(n, fds.res_out); zero_fd_set(n, fds.res_ex); ret = do_select(n, &fds, end_time); if (ret < 0) goto out; if (!ret) { ret = -ERESTARTNOHAND; if (signal_pending(current)) goto out; ret = 0; } if (set_fd_set(n, inp, fds.res_in) || set_fd_set(n, outp, fds.res_out) || set_fd_set(n, exp, fds.res_ex)) ret = -EFAULT; out: if (bits != stack_fds) kvfree(bits); out_nofds: return ret; } static int kern_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct __kernel_old_timeval __user *tvp) { struct timespec64 end_time, *to = NULL; struct __kernel_old_timeval tv; int ret; if (tvp) { if (copy_from_user(&tv, tvp, sizeof(tv))) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, tv.tv_sec + (tv.tv_usec / USEC_PER_SEC), (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC)) return -EINVAL; } ret = core_sys_select(n, inp, outp, exp, to); return poll_select_finish(&end_time, tvp, PT_TIMEVAL, ret); } SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp, fd_set __user *, exp, struct __kernel_old_timeval __user *, tvp) { return kern_select(n, inp, outp, exp, tvp); } static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, void __user *tsp, const sigset_t __user *sigmask, size_t sigsetsize, enum poll_time_type type) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { switch (type) { case PT_TIMESPEC: if (get_timespec64(&ts, tsp)) return -EFAULT; break; case PT_OLD_TIMESPEC: if (get_old_timespec32(&ts, tsp)) return -EFAULT; break; default: BUG(); } to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = core_sys_select(n, inp, outp, exp, to); return poll_select_finish(&end_time, tsp, type, ret); } /* * Most architectures can't handle 7-argument syscalls. So we provide a * 6-argument version where the sixth argument is a pointer to a structure * which has a pointer to the sigset_t itself followed by a size_t containing * the sigset size. */ struct sigset_argpack { sigset_t __user *p; size_t size; }; static inline int get_sigset_argpack(struct sigset_argpack *to, struct sigset_argpack __user *from) { // the path is hot enough for overhead of copy_from_user() to matter if (from) { if (!user_read_access_begin(from, sizeof(*from))) return -EFAULT; unsafe_get_user(to->p, &from->p, Efault); unsafe_get_user(to->size, &from->size, Efault); user_read_access_end(); } return 0; Efault: user_access_end(); return -EFAULT; } SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp, fd_set __user *, exp, struct __kernel_timespec __user *, tsp, void __user *, sig) { struct sigset_argpack x = {NULL, 0}; if (get_sigset_argpack(&x, sig)) return -EFAULT; return do_pselect(n, inp, outp, exp, tsp, x.p, x.size, PT_TIMESPEC); } #if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT) SYSCALL_DEFINE6(pselect6_time32, int, n, fd_set __user *, inp, fd_set __user *, outp, fd_set __user *, exp, struct old_timespec32 __user *, tsp, void __user *, sig) { struct sigset_argpack x = {NULL, 0}; if (get_sigset_argpack(&x, sig)) return -EFAULT; return do_pselect(n, inp, outp, exp, tsp, x.p, x.size, PT_OLD_TIMESPEC); } #endif #ifdef __ARCH_WANT_SYS_OLD_SELECT struct sel_arg_struct { unsigned long n; fd_set __user *inp, *outp, *exp; struct __kernel_old_timeval __user *tvp; }; SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg) { struct sel_arg_struct a; if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; return kern_select(a.n, a.inp, a.outp, a.exp, a.tvp); } #endif struct poll_list { struct poll_list *next; unsigned int len; struct pollfd entries[]; }; #define POLLFD_PER_PAGE ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd)) /* * Fish for pollable events on the pollfd->fd file descriptor. We're only * interested in events matching the pollfd->events mask, and the result * matching that mask is both recorded in pollfd->revents and returned. The * pwait poll_table will be used by the fd-provided poll handler for waiting, * if pwait->_qproc is non-NULL. */ static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait, bool *can_busy_poll, __poll_t busy_flag) { int fd = pollfd->fd; __poll_t mask = 0, filter; struct fd f; if (fd < 0) goto out; mask = EPOLLNVAL; f = fdget(fd); if (!f.file) goto out; /* userland u16 ->events contains POLL... bitmap */ filter = demangle_poll(pollfd->events) | EPOLLERR | EPOLLHUP; pwait->_key = filter | busy_flag; mask = vfs_poll(f.file, pwait); if (mask & busy_flag) *can_busy_poll = true; mask &= filter; /* Mask out unneeded events. */ fdput(f); out: /* ... and so does ->revents */ pollfd->revents = mangle_poll(mask); return mask; } static int do_poll(struct poll_list *list, struct poll_wqueues *wait, struct timespec64 *end_time) { poll_table* pt = &wait->pt; ktime_t expire, *to = NULL; int timed_out = 0, count = 0; u64 slack = 0; __poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; unsigned long busy_start = 0; /* Optimise the no-wait case */ if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { pt->_qproc = NULL; timed_out = 1; } if (end_time && !timed_out) slack = select_estimate_accuracy(end_time); for (;;) { struct poll_list *walk; bool can_busy_loop = false; for (walk = list; walk != NULL; walk = walk->next) { struct pollfd * pfd, * pfd_end; pfd = walk->entries; pfd_end = pfd + walk->len; for (; pfd != pfd_end; pfd++) { /* * Fish for events. If we found one, record it * and kill poll_table->_qproc, so we don't * needlessly register any other waiters after * this. They'll get immediately deregistered * when we break out and return. */ if (do_pollfd(pfd, pt, &can_busy_loop, busy_flag)) { count++; pt->_qproc = NULL; /* found something, stop busy polling */ busy_flag = 0; can_busy_loop = false; } } } /* * All waiters have already been registered, so don't provide * a poll_table->_qproc to them on the next loop iteration. */ pt->_qproc = NULL; if (!count) { count = wait->error; if (signal_pending(current)) count = -ERESTARTNOHAND; } if (count || timed_out) break; /* only if found POLL_BUSY_LOOP sockets && not out of time */ if (can_busy_loop && !need_resched()) { if (!busy_start) { busy_start = busy_loop_current_time(); continue; } if (!busy_loop_timeout(busy_start)) continue; } busy_flag = 0; /* * If this is the first loop and we have a timeout * given, then we convert to ktime_t and set the to * pointer to the expiry value. */ if (end_time && !to) { expire = timespec64_to_ktime(*end_time); to = &expire; } if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack)) timed_out = 1; } return count; } #define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list)) / \ sizeof(struct pollfd)) static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, struct timespec64 *end_time) { struct poll_wqueues table; int err = -EFAULT, fdcount; /* Allocate small arguments on the stack to save memory and be faster - use long to make sure the buffer is aligned properly on 64 bit archs to avoid unaligned access */ long stack_pps[POLL_STACK_ALLOC/sizeof(long)]; struct poll_list *const head = (struct poll_list *)stack_pps; struct poll_list *walk = head; unsigned int todo = nfds; unsigned int len; if (nfds > rlimit(RLIMIT_NOFILE)) return -EINVAL; len = min_t(unsigned int, nfds, N_STACK_PPS); for (;;) { walk->next = NULL; walk->len = len; if (!len) break; if (copy_from_user(walk->entries, ufds + nfds-todo, sizeof(struct pollfd) * walk->len)) goto out_fds; if (walk->len >= todo) break; todo -= walk->len; len = min(todo, POLLFD_PER_PAGE); walk = walk->next = kmalloc(struct_size(walk, entries, len), GFP_KERNEL); if (!walk) { err = -ENOMEM; goto out_fds; } } poll_initwait(&table); fdcount = do_poll(head, &table, end_time); poll_freewait(&table); if (!user_write_access_begin(ufds, nfds * sizeof(*ufds))) goto out_fds; for (walk = head; walk; walk = walk->next) { struct pollfd *fds = walk->entries; unsigned int j; for (j = walk->len; j; fds++, ufds++, j--) unsafe_put_user(fds->revents, &ufds->revents, Efault); } user_write_access_end(); err = fdcount; out_fds: walk = head->next; while (walk) { struct poll_list *pos = walk; walk = walk->next; kfree(pos); } return err; Efault: user_write_access_end(); err = -EFAULT; goto out_fds; } static long do_restart_poll(struct restart_block *restart_block) { struct pollfd __user *ufds = restart_block->poll.ufds; int nfds = restart_block->poll.nfds; struct timespec64 *to = NULL, end_time; int ret; if (restart_block->poll.has_timeout) { end_time.tv_sec = restart_block->poll.tv_sec; end_time.tv_nsec = restart_block->poll.tv_nsec; to = &end_time; } ret = do_sys_poll(ufds, nfds, to); if (ret == -ERESTARTNOHAND) ret = set_restart_fn(restart_block, do_restart_poll); return ret; } SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, int, timeout_msecs) { struct timespec64 end_time, *to = NULL; int ret; if (timeout_msecs >= 0) { to = &end_time; poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC, NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC)); } ret = do_sys_poll(ufds, nfds, to); if (ret == -ERESTARTNOHAND) { struct restart_block *restart_block; restart_block = &current->restart_block; restart_block->poll.ufds = ufds; restart_block->poll.nfds = nfds; if (timeout_msecs >= 0) { restart_block->poll.tv_sec = end_time.tv_sec; restart_block->poll.tv_nsec = end_time.tv_nsec; restart_block->poll.has_timeout = 1; } else restart_block->poll.has_timeout = 0; ret = set_restart_fn(restart_block, do_restart_poll); } return ret; } SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, struct __kernel_timespec __user *, tsp, const sigset_t __user *, sigmask, size_t, sigsetsize) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { if (get_timespec64(&ts, tsp)) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); return poll_select_finish(&end_time, tsp, PT_TIMESPEC, ret); } #if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT) SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds, struct old_timespec32 __user *, tsp, const sigset_t __user *, sigmask, size_t, sigsetsize) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { if (get_old_timespec32(&ts, tsp)) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); return poll_select_finish(&end_time, tsp, PT_OLD_TIMESPEC, ret); } #endif #ifdef CONFIG_COMPAT #define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t)) /* * Ooo, nasty. We need here to frob 32-bit unsigned longs to * 64-bit unsigned longs. */ static int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, unsigned long *fdset) { if (ufdset) { return compat_get_bitmap(fdset, ufdset, nr); } else { zero_fd_set(nr, fdset); return 0; } } static int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, unsigned long *fdset) { if (!ufdset) return 0; return compat_put_bitmap(ufdset, fdset, nr); } /* * This is a virtual copy of sys_select from fs/select.c and probably * should be compared to it from time to time */ /* * We can actually return ERESTARTSYS instead of EINTR, but I'd * like to be certain this leads to no problems. So I return * EINTR just for safety. * * Update: ERESTARTSYS breaks at least the xview clock binary, so * I'm trying ERESTARTNOHAND which restart only when you want to. */ static int compat_core_sys_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, struct timespec64 *end_time) { fd_set_bits fds; void *bits; int size, max_fds, ret = -EINVAL; struct fdtable *fdt; long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; if (n < 0) goto out_nofds; /* max_fds can increase, so grab it once to avoid race */ rcu_read_lock(); fdt = files_fdtable(current->files); max_fds = fdt->max_fds; rcu_read_unlock(); if (n > max_fds) n = max_fds; /* * We need 6 bitmaps (in/out/ex for both incoming and outgoing), * since we used fdset we need to allocate memory in units of * long-words. */ size = FDS_BYTES(n); bits = stack_fds; if (size > sizeof(stack_fds) / 6) { bits = kmalloc_array(6, size, GFP_KERNEL); ret = -ENOMEM; if (!bits) goto out_nofds; } fds.in = (unsigned long *) bits; fds.out = (unsigned long *) (bits + size); fds.ex = (unsigned long *) (bits + 2*size); fds.res_in = (unsigned long *) (bits + 3*size); fds.res_out = (unsigned long *) (bits + 4*size); fds.res_ex = (unsigned long *) (bits + 5*size); if ((ret = compat_get_fd_set(n, inp, fds.in)) || (ret = compat_get_fd_set(n, outp, fds.out)) || (ret = compat_get_fd_set(n, exp, fds.ex))) goto out; zero_fd_set(n, fds.res_in); zero_fd_set(n, fds.res_out); zero_fd_set(n, fds.res_ex); ret = do_select(n, &fds, end_time); if (ret < 0) goto out; if (!ret) { ret = -ERESTARTNOHAND; if (signal_pending(current)) goto out; ret = 0; } if (compat_set_fd_set(n, inp, fds.res_in) || compat_set_fd_set(n, outp, fds.res_out) || compat_set_fd_set(n, exp, fds.res_ex)) ret = -EFAULT; out: if (bits != stack_fds) kfree(bits); out_nofds: return ret; } static int do_compat_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, struct old_timeval32 __user *tvp) { struct timespec64 end_time, *to = NULL; struct old_timeval32 tv; int ret; if (tvp) { if (copy_from_user(&tv, tvp, sizeof(tv))) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, tv.tv_sec + (tv.tv_usec / USEC_PER_SEC), (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC)) return -EINVAL; } ret = compat_core_sys_select(n, inp, outp, exp, to); return poll_select_finish(&end_time, tvp, PT_OLD_TIMEVAL, ret); } COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp, compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, struct old_timeval32 __user *, tvp) { return do_compat_select(n, inp, outp, exp, tvp); } struct compat_sel_arg_struct { compat_ulong_t n; compat_uptr_t inp; compat_uptr_t outp; compat_uptr_t exp; compat_uptr_t tvp; }; COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg) { struct compat_sel_arg_struct a; if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; return do_compat_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp), compat_ptr(a.exp), compat_ptr(a.tvp)); } static long do_compat_pselect(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, void __user *tsp, compat_sigset_t __user *sigmask, compat_size_t sigsetsize, enum poll_time_type type) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { switch (type) { case PT_OLD_TIMESPEC: if (get_old_timespec32(&ts, tsp)) return -EFAULT; break; case PT_TIMESPEC: if (get_timespec64(&ts, tsp)) return -EFAULT; break; default: BUG(); } to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_compat_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = compat_core_sys_select(n, inp, outp, exp, to); return poll_select_finish(&end_time, tsp, type, ret); } struct compat_sigset_argpack { compat_uptr_t p; compat_size_t size; }; static inline int get_compat_sigset_argpack(struct compat_sigset_argpack *to, struct compat_sigset_argpack __user *from) { if (from) { if (!user_read_access_begin(from, sizeof(*from))) return -EFAULT; unsafe_get_user(to->p, &from->p, Efault); unsafe_get_user(to->size, &from->size, Efault); user_read_access_end(); } return 0; Efault: user_access_end(); return -EFAULT; } COMPAT_SYSCALL_DEFINE6(pselect6_time64, int, n, compat_ulong_t __user *, inp, compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, struct __kernel_timespec __user *, tsp, void __user *, sig) { struct compat_sigset_argpack x = {0, 0}; if (get_compat_sigset_argpack(&x, sig)) return -EFAULT; return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(x.p), x.size, PT_TIMESPEC); } #if defined(CONFIG_COMPAT_32BIT_TIME) COMPAT_SYSCALL_DEFINE6(pselect6_time32, int, n, compat_ulong_t __user *, inp, compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, struct old_timespec32 __user *, tsp, void __user *, sig) { struct compat_sigset_argpack x = {0, 0}; if (get_compat_sigset_argpack(&x, sig)) return -EFAULT; return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(x.p), x.size, PT_OLD_TIMESPEC); } #endif #if defined(CONFIG_COMPAT_32BIT_TIME) COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds, struct old_timespec32 __user *, tsp, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { if (get_old_timespec32(&ts, tsp)) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_compat_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); return poll_select_finish(&end_time, tsp, PT_OLD_TIMESPEC, ret); } #endif /* New compat syscall for 64 bit time_t*/ COMPAT_SYSCALL_DEFINE5(ppoll_time64, struct pollfd __user *, ufds, unsigned int, nfds, struct __kernel_timespec __user *, tsp, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { if (get_timespec64(&ts, tsp)) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_compat_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); return poll_select_finish(&end_time, tsp, PT_TIMESPEC, ret); } #endif
1 1 3 4 3 3 3 1 1 1 1 1 1 2 2 5 2 5 5 5 2 2 2 7 7 3 3 7 6 6 4 1 1 3 3 3 3 3 3 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2021 Cong Wang <cong.wang@bytedance.com> */ #include <linux/skmsg.h> #include <linux/bpf.h> #include <net/sock.h> #include <net/af_unix.h> #define unix_sk_has_data(__sk, __psock) \ ({ !skb_queue_empty(&__sk->sk_receive_queue) || \ !skb_queue_empty(&__psock->ingress_skb) || \ !list_empty(&__psock->ingress_msg); \ }) static int unix_msg_wait_data(struct sock *sk, struct sk_psock *psock, long timeo) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct unix_sock *u = unix_sk(sk); int ret = 0; if (sk->sk_shutdown & RCV_SHUTDOWN) return 1; if (!timeo) return ret; add_wait_queue(sk_sleep(sk), &wait); sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); if (!unix_sk_has_data(sk, psock)) { mutex_unlock(&u->iolock); wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); mutex_lock(&u->iolock); ret = unix_sk_has_data(sk, psock); } sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); remove_wait_queue(sk_sleep(sk), &wait); return ret; } static int __unix_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags) { if (sk->sk_type == SOCK_DGRAM) return __unix_dgram_recvmsg(sk, msg, len, flags); else return __unix_stream_recvmsg(sk, msg, len, flags); } static int unix_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct unix_sock *u = unix_sk(sk); struct sk_psock *psock; int copied; if (!len) return 0; psock = sk_psock_get(sk); if (unlikely(!psock)) return __unix_recvmsg(sk, msg, len, flags); mutex_lock(&u->iolock); if (!skb_queue_empty(&sk->sk_receive_queue) && sk_psock_queue_empty(psock)) { mutex_unlock(&u->iolock); sk_psock_put(sk, psock); return __unix_recvmsg(sk, msg, len, flags); } msg_bytes_ready: copied = sk_msg_recvmsg(sk, psock, msg, len, flags); if (!copied) { long timeo; int data; timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); data = unix_msg_wait_data(sk, psock, timeo); if (data) { if (!sk_psock_queue_empty(psock)) goto msg_bytes_ready; mutex_unlock(&u->iolock); sk_psock_put(sk, psock); return __unix_recvmsg(sk, msg, len, flags); } copied = -EAGAIN; } mutex_unlock(&u->iolock); sk_psock_put(sk, psock); return copied; } static struct proto *unix_dgram_prot_saved __read_mostly; static DEFINE_SPINLOCK(unix_dgram_prot_lock); static struct proto unix_dgram_bpf_prot; static struct proto *unix_stream_prot_saved __read_mostly; static DEFINE_SPINLOCK(unix_stream_prot_lock); static struct proto unix_stream_bpf_prot; static void unix_dgram_bpf_rebuild_protos(struct proto *prot, const struct proto *base) { *prot = *base; prot->close = sock_map_close; prot->recvmsg = unix_bpf_recvmsg; prot->sock_is_readable = sk_msg_is_readable; } static void unix_stream_bpf_rebuild_protos(struct proto *prot, const struct proto *base) { *prot = *base; prot->close = sock_map_close; prot->recvmsg = unix_bpf_recvmsg; prot->sock_is_readable = sk_msg_is_readable; prot->unhash = sock_map_unhash; } static void unix_dgram_bpf_check_needs_rebuild(struct proto *ops) { if (unlikely(ops != smp_load_acquire(&unix_dgram_prot_saved))) { spin_lock_bh(&unix_dgram_prot_lock); if (likely(ops != unix_dgram_prot_saved)) { unix_dgram_bpf_rebuild_protos(&unix_dgram_bpf_prot, ops); smp_store_release(&unix_dgram_prot_saved, ops); } spin_unlock_bh(&unix_dgram_prot_lock); } } static void unix_stream_bpf_check_needs_rebuild(struct proto *ops) { if (unlikely(ops != smp_load_acquire(&unix_stream_prot_saved))) { spin_lock_bh(&unix_stream_prot_lock); if (likely(ops != unix_stream_prot_saved)) { unix_stream_bpf_rebuild_protos(&unix_stream_bpf_prot, ops); smp_store_release(&unix_stream_prot_saved, ops); } spin_unlock_bh(&unix_stream_prot_lock); } } int unix_dgram_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) { if (sk->sk_type != SOCK_DGRAM) return -EOPNOTSUPP; if (restore) { sk->sk_write_space = psock->saved_write_space; sock_replace_proto(sk, psock->sk_proto); return 0; } unix_dgram_bpf_check_needs_rebuild(psock->sk_proto); sock_replace_proto(sk, &unix_dgram_bpf_prot); return 0; } int unix_stream_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) { struct sock *sk_pair; /* Restore does not decrement the sk_pair reference yet because we must * keep the a reference to the socket until after an RCU grace period * and any pending sends have completed. */ if (restore) { sk->sk_write_space = psock->saved_write_space; sock_replace_proto(sk, psock->sk_proto); return 0; } /* psock_update_sk_prot can be called multiple times if psock is * added to multiple maps and/or slots in the same map. There is * also an edge case where replacing a psock with itself can trigger * an extra psock_update_sk_prot during the insert process. So it * must be safe to do multiple calls. Here we need to ensure we don't * increment the refcnt through sock_hold many times. There will only * be a single matching destroy operation. */ if (!psock->sk_pair) { sk_pair = unix_peer(sk); sock_hold(sk_pair); psock->sk_pair = sk_pair; } unix_stream_bpf_check_needs_rebuild(psock->sk_proto); sock_replace_proto(sk, &unix_stream_bpf_prot); return 0; } void __init unix_bpf_build_proto(void) { unix_dgram_bpf_rebuild_protos(&unix_dgram_bpf_prot, &unix_dgram_proto); unix_stream_bpf_rebuild_protos(&unix_stream_bpf_prot, &unix_stream_proto); }
11 2 2 2 2 7 13 91 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NETFILTER_NETDEV_H_ #define _NETFILTER_NETDEV_H_ #include <linux/netfilter.h> #include <linux/netdevice.h> #ifdef CONFIG_NETFILTER_INGRESS static inline bool nf_hook_ingress_active(const struct sk_buff *skb) { #ifdef CONFIG_JUMP_LABEL if (!static_key_false(&nf_hooks_needed[NFPROTO_NETDEV][NF_NETDEV_INGRESS])) return false; #endif return rcu_access_pointer(skb->dev->nf_hooks_ingress); } /* caller must hold rcu_read_lock */ static inline int nf_hook_ingress(struct sk_buff *skb) { struct nf_hook_entries *e = rcu_dereference(skb->dev->nf_hooks_ingress); struct nf_hook_state state; int ret; /* Must recheck the ingress hook head, in the event it became NULL * after the check in nf_hook_ingress_active evaluated to true. */ if (unlikely(!e)) return 0; nf_hook_state_init(&state, NF_NETDEV_INGRESS, NFPROTO_NETDEV, skb->dev, NULL, NULL, dev_net(skb->dev), NULL); ret = nf_hook_slow(skb, &state, e, 0); if (ret == 0) return -1; return ret; } #else /* CONFIG_NETFILTER_INGRESS */ static inline int nf_hook_ingress_active(struct sk_buff *skb) { return 0; } static inline int nf_hook_ingress(struct sk_buff *skb) { return 0; } #endif /* CONFIG_NETFILTER_INGRESS */ #ifdef CONFIG_NETFILTER_EGRESS static inline bool nf_hook_egress_active(void) { #ifdef CONFIG_JUMP_LABEL if (!static_key_false(&nf_hooks_needed[NFPROTO_NETDEV][NF_NETDEV_EGRESS])) return false; #endif return true; } /** * nf_hook_egress - classify packets before transmission * @skb: packet to be classified * @rc: result code which shall be returned by __dev_queue_xmit() on failure * @dev: netdev whose egress hooks shall be applied to @skb * * Returns @skb on success or %NULL if the packet was consumed or filtered. * Caller must hold rcu_read_lock. * * On ingress, packets are classified first by tc, then by netfilter. * On egress, the order is reversed for symmetry. Conceptually, tc and * netfilter can be thought of as layers, with netfilter layered above tc: * When tc redirects a packet to another interface, netfilter is not applied * because the packet is on the tc layer. * * The nf_skip_egress flag controls whether netfilter is applied on egress. * It is updated by __netif_receive_skb_core() and __dev_queue_xmit() when the * packet passes through tc and netfilter. Because __dev_queue_xmit() may be * called recursively by tunnel drivers such as vxlan, the flag is reverted to * false after sch_handle_egress(). This ensures that netfilter is applied * both on the overlay and underlying network. */ static inline struct sk_buff *nf_hook_egress(struct sk_buff *skb, int *rc, struct net_device *dev) { struct nf_hook_entries *e; struct nf_hook_state state; int ret; #ifdef CONFIG_NETFILTER_SKIP_EGRESS if (skb->nf_skip_egress) return skb; #endif e = rcu_dereference_check(dev->nf_hooks_egress, rcu_read_lock_bh_held()); if (!e) return skb; nf_hook_state_init(&state, NF_NETDEV_EGRESS, NFPROTO_NETDEV, NULL, dev, NULL, dev_net(dev), NULL); /* nf assumes rcu_read_lock, not just read_lock_bh */ rcu_read_lock(); ret = nf_hook_slow(skb, &state, e, 0); rcu_read_unlock(); if (ret == 1) { return skb; } else if (ret < 0) { *rc = NET_XMIT_DROP; return NULL; } else { /* ret == 0 */ *rc = NET_XMIT_SUCCESS; return NULL; } } #else /* CONFIG_NETFILTER_EGRESS */ static inline bool nf_hook_egress_active(void) { return false; } static inline struct sk_buff *nf_hook_egress(struct sk_buff *skb, int *rc, struct net_device *dev) { return skb; } #endif /* CONFIG_NETFILTER_EGRESS */ static inline void nf_skip_egress(struct sk_buff *skb, bool skip) { #ifdef CONFIG_NETFILTER_SKIP_EGRESS skb->nf_skip_egress = skip; #endif } static inline void nf_hook_netdev_init(struct net_device *dev) { #ifdef CONFIG_NETFILTER_INGRESS RCU_INIT_POINTER(dev->nf_hooks_ingress, NULL); #endif #ifdef CONFIG_NETFILTER_EGRESS RCU_INIT_POINTER(dev->nf_hooks_egress, NULL); #endif } #endif /* _NETFILTER_NETDEV_H_ */
2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 // SPDX-License-Identifier: GPL-2.0 #ifndef IOU_FILE_TABLE_H #define IOU_FILE_TABLE_H #include <linux/file.h> #include <linux/io_uring_types.h> bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files); void io_free_file_tables(struct io_file_table *table); int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, struct file *file, unsigned int file_slot); int __io_fixed_fd_install(struct io_ring_ctx *ctx, struct file *file, unsigned int file_slot); int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset); int io_register_file_alloc_range(struct io_ring_ctx *ctx, struct io_uring_file_index_range __user *arg); io_req_flags_t io_file_get_flags(struct file *file); static inline void io_file_bitmap_clear(struct io_file_table *table, int bit) { WARN_ON_ONCE(!test_bit(bit, table->bitmap)); __clear_bit(bit, table->bitmap); table->alloc_hint = bit; } static inline void io_file_bitmap_set(struct io_file_table *table, int bit) { WARN_ON_ONCE(test_bit(bit, table->bitmap)); __set_bit(bit, table->bitmap); table->alloc_hint = bit + 1; } static inline struct io_fixed_file * io_fixed_file_slot(struct io_file_table *table, unsigned i) { return &table->files[i]; } #define FFS_NOWAIT 0x1UL #define FFS_ISREG 0x2UL #define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG) static inline unsigned int io_slot_flags(struct io_fixed_file *slot) { return (slot->file_ptr & ~FFS_MASK) << REQ_F_SUPPORT_NOWAIT_BIT; } static inline struct file *io_slot_file(struct io_fixed_file *slot) { return (struct file *)(slot->file_ptr & FFS_MASK); } static inline struct file *io_file_from_index(struct io_file_table *table, int index) { return io_slot_file(io_fixed_file_slot(table, index)); } static inline void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file) { file_slot->file_ptr = (unsigned long)file | (io_file_get_flags(file) >> REQ_F_SUPPORT_NOWAIT_BIT); } static inline void io_reset_alloc_hint(struct io_ring_ctx *ctx) { ctx->file_table.alloc_hint = ctx->file_alloc_start; } static inline void io_file_table_set_alloc_range(struct io_ring_ctx *ctx, unsigned off, unsigned len) { ctx->file_alloc_start = off; ctx->file_alloc_end = off + len; io_reset_alloc_hint(ctx); } #endif
2 2 16 2 2 16 2 2 2 2 2 16 2 16 8 8 11 11 9 9 2 2 2 2 2 2 2 2 2 2 11 2 2 2 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 // SPDX-License-Identifier: GPL-2.0 /* * Disk events - monitor disk events like media change and eject request. */ #include <linux/export.h> #include <linux/moduleparam.h> #include <linux/blkdev.h> #include "blk.h" struct disk_events { struct list_head node; /* all disk_event's */ struct gendisk *disk; /* the associated disk */ spinlock_t lock; struct mutex block_mutex; /* protects blocking */ int block; /* event blocking depth */ unsigned int pending; /* events already sent out */ unsigned int clearing; /* events being cleared */ long poll_msecs; /* interval, -1 for default */ struct delayed_work dwork; }; static const char *disk_events_strs[] = { [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "media_change", [ilog2(DISK_EVENT_EJECT_REQUEST)] = "eject_request", }; static char *disk_uevents[] = { [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "DISK_MEDIA_CHANGE=1", [ilog2(DISK_EVENT_EJECT_REQUEST)] = "DISK_EJECT_REQUEST=1", }; /* list of all disk_events */ static DEFINE_MUTEX(disk_events_mutex); static LIST_HEAD(disk_events); /* disable in-kernel polling by default */ static unsigned long disk_events_dfl_poll_msecs; static unsigned long disk_events_poll_jiffies(struct gendisk *disk) { struct disk_events *ev = disk->ev; long intv_msecs = 0; /* * If device-specific poll interval is set, always use it. If * the default is being used, poll if the POLL flag is set. */ if (ev->poll_msecs >= 0) intv_msecs = ev->poll_msecs; else if (disk->event_flags & DISK_EVENT_FLAG_POLL) intv_msecs = disk_events_dfl_poll_msecs; return msecs_to_jiffies(intv_msecs); } /** * disk_block_events - block and flush disk event checking * @disk: disk to block events for * * On return from this function, it is guaranteed that event checking * isn't in progress and won't happen until unblocked by * disk_unblock_events(). Events blocking is counted and the actual * unblocking happens after the matching number of unblocks are done. * * Note that this intentionally does not block event checking from * disk_clear_events(). * * CONTEXT: * Might sleep. */ void disk_block_events(struct gendisk *disk) { struct disk_events *ev = disk->ev; unsigned long flags; bool cancel; if (!ev) return; /* * Outer mutex ensures that the first blocker completes canceling * the event work before further blockers are allowed to finish. */ mutex_lock(&ev->block_mutex); spin_lock_irqsave(&ev->lock, flags); cancel = !ev->block++; spin_unlock_irqrestore(&ev->lock, flags); if (cancel) cancel_delayed_work_sync(&disk->ev->dwork); mutex_unlock(&ev->block_mutex); } static void __disk_unblock_events(struct gendisk *disk, bool check_now) { struct disk_events *ev = disk->ev; unsigned long intv; unsigned long flags; spin_lock_irqsave(&ev->lock, flags); if (WARN_ON_ONCE(ev->block <= 0)) goto out_unlock; if (--ev->block) goto out_unlock; intv = disk_events_poll_jiffies(disk); if (check_now) queue_delayed_work(system_freezable_power_efficient_wq, &ev->dwork, 0); else if (intv) queue_delayed_work(system_freezable_power_efficient_wq, &ev->dwork, intv); out_unlock: spin_unlock_irqrestore(&ev->lock, flags); } /** * disk_unblock_events - unblock disk event checking * @disk: disk to unblock events for * * Undo disk_block_events(). When the block count reaches zero, it * starts events polling if configured. * * CONTEXT: * Don't care. Safe to call from irq context. */ void disk_unblock_events(struct gendisk *disk) { if (disk->ev) __disk_unblock_events(disk, false); } /** * disk_flush_events - schedule immediate event checking and flushing * @disk: disk to check and flush events for * @mask: events to flush * * Schedule immediate event checking on @disk if not blocked. Events in * @mask are scheduled to be cleared from the driver. Note that this * doesn't clear the events from @disk->ev. * * CONTEXT: * If @mask is non-zero must be called with disk->open_mutex held. */ void disk_flush_events(struct gendisk *disk, unsigned int mask) { struct disk_events *ev = disk->ev; if (!ev) return; spin_lock_irq(&ev->lock); ev->clearing |= mask; if (!ev->block) mod_delayed_work(system_freezable_power_efficient_wq, &ev->dwork, 0); spin_unlock_irq(&ev->lock); } /* * Tell userland about new events. Only the events listed in @disk->events are * reported, and only if DISK_EVENT_FLAG_UEVENT is set. Otherwise, events are * processed internally but never get reported to userland. */ static void disk_event_uevent(struct gendisk *disk, unsigned int events) { char *envp[ARRAY_SIZE(disk_uevents) + 1] = { }; int nr_events = 0, i; for (i = 0; i < ARRAY_SIZE(disk_uevents); i++) if (events & disk->events & (1 << i)) envp[nr_events++] = disk_uevents[i]; if (nr_events) kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); } static void disk_check_events(struct disk_events *ev, unsigned int *clearing_ptr) { struct gendisk *disk = ev->disk; unsigned int clearing = *clearing_ptr; unsigned int events; unsigned long intv; /* check events */ events = disk->fops->check_events(disk, clearing); /* accumulate pending events and schedule next poll if necessary */ spin_lock_irq(&ev->lock); events &= ~ev->pending; ev->pending |= events; *clearing_ptr &= ~clearing; intv = disk_events_poll_jiffies(disk); if (!ev->block && intv) queue_delayed_work(system_freezable_power_efficient_wq, &ev->dwork, intv); spin_unlock_irq(&ev->lock); if (events & DISK_EVENT_MEDIA_CHANGE) inc_diskseq(disk); if (disk->event_flags & DISK_EVENT_FLAG_UEVENT) disk_event_uevent(disk, events); } /** * disk_clear_events - synchronously check, clear and return pending events * @disk: disk to fetch and clear events from * @mask: mask of events to be fetched and cleared * * Disk events are synchronously checked and pending events in @mask * are cleared and returned. This ignores the block count. * * CONTEXT: * Might sleep. */ static unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) { struct disk_events *ev = disk->ev; unsigned int pending; unsigned int clearing = mask; if (!ev) return 0; disk_block_events(disk); /* * store the union of mask and ev->clearing on the stack so that the * race with disk_flush_events does not cause ambiguity (ev->clearing * can still be modified even if events are blocked). */ spin_lock_irq(&ev->lock); clearing |= ev->clearing; ev->clearing = 0; spin_unlock_irq(&ev->lock); disk_check_events(ev, &clearing); /* * if ev->clearing is not 0, the disk_flush_events got called in the * middle of this function, so we want to run the workfn without delay. */ __disk_unblock_events(disk, ev->clearing ? true : false); /* then, fetch and clear pending events */ spin_lock_irq(&ev->lock); pending = ev->pending & mask; ev->pending &= ~mask; spin_unlock_irq(&ev->lock); WARN_ON_ONCE(clearing & mask); return pending; } /** * disk_check_media_change - check if a removable media has been changed * @disk: gendisk to check * * Returns %true and marks the disk for a partition rescan whether a removable * media has been changed, and %false if the media did not change. */ bool disk_check_media_change(struct gendisk *disk) { unsigned int events; events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE | DISK_EVENT_EJECT_REQUEST); if (events & DISK_EVENT_MEDIA_CHANGE) { set_bit(GD_NEED_PART_SCAN, &disk->state); return true; } return false; } EXPORT_SYMBOL(disk_check_media_change); /** * disk_force_media_change - force a media change event * @disk: the disk which will raise the event * * Should be called when the media changes for @disk. Generates a uevent * and attempts to free all dentries and inodes and invalidates all block * device page cache entries in that case. */ void disk_force_media_change(struct gendisk *disk) { disk_event_uevent(disk, DISK_EVENT_MEDIA_CHANGE); inc_diskseq(disk); bdev_mark_dead(disk->part0, true); set_bit(GD_NEED_PART_SCAN, &disk->state); } EXPORT_SYMBOL_GPL(disk_force_media_change); /* * Separate this part out so that a different pointer for clearing_ptr can be * passed in for disk_clear_events. */ static void disk_events_workfn(struct work_struct *work) { struct delayed_work *dwork = to_delayed_work(work); struct disk_events *ev = container_of(dwork, struct disk_events, dwork); disk_check_events(ev, &ev->clearing); } /* * A disk events enabled device has the following sysfs nodes under * its /sys/block/X/ directory. * * events : list of all supported events * events_async : list of events which can be detected w/o polling * (always empty, only for backwards compatibility) * events_poll_msecs : polling interval, 0: disable, -1: system default */ static ssize_t __disk_events_show(unsigned int events, char *buf) { const char *delim = ""; ssize_t pos = 0; int i; for (i = 0; i < ARRAY_SIZE(disk_events_strs); i++) if (events & (1 << i)) { pos += sprintf(buf + pos, "%s%s", delim, disk_events_strs[i]); delim = " "; } if (pos) pos += sprintf(buf + pos, "\n"); return pos; } static ssize_t disk_events_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); if (!(disk->event_flags & DISK_EVENT_FLAG_UEVENT)) return 0; return __disk_events_show(disk->events, buf); } static ssize_t disk_events_async_show(struct device *dev, struct device_attribute *attr, char *buf) { return 0; } static ssize_t disk_events_poll_msecs_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); if (!disk->ev) return sprintf(buf, "-1\n"); return sprintf(buf, "%ld\n", disk->ev->poll_msecs); } static ssize_t disk_events_poll_msecs_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct gendisk *disk = dev_to_disk(dev); long intv; if (!count || !sscanf(buf, "%ld", &intv)) return -EINVAL; if (intv < 0 && intv != -1) return -EINVAL; if (!disk->ev) return -ENODEV; disk_block_events(disk); disk->ev->poll_msecs = intv; __disk_unblock_events(disk, true); return count; } DEVICE_ATTR(events, 0444, disk_events_show, NULL); DEVICE_ATTR(events_async, 0444, disk_events_async_show, NULL); DEVICE_ATTR(events_poll_msecs, 0644, disk_events_poll_msecs_show, disk_events_poll_msecs_store); /* * The default polling interval can be specified by the kernel * parameter block.events_dfl_poll_msecs which defaults to 0 * (disable). This can also be modified runtime by writing to * /sys/module/block/parameters/events_dfl_poll_msecs. */ static int disk_events_set_dfl_poll_msecs(const char *val, const struct kernel_param *kp) { struct disk_events *ev; int ret; ret = param_set_ulong(val, kp); if (ret < 0) return ret; mutex_lock(&disk_events_mutex); list_for_each_entry(ev, &disk_events, node) disk_flush_events(ev->disk, 0); mutex_unlock(&disk_events_mutex); return 0; } static const struct kernel_param_ops disk_events_dfl_poll_msecs_param_ops = { .set = disk_events_set_dfl_poll_msecs, .get = param_get_ulong, }; #undef MODULE_PARAM_PREFIX #define MODULE_PARAM_PREFIX "block." module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops, &disk_events_dfl_poll_msecs, 0644); /* * disk_{alloc|add|del|release}_events - initialize and destroy disk_events. */ int disk_alloc_events(struct gendisk *disk) { struct disk_events *ev; if (!disk->fops->check_events || !disk->events) return 0; ev = kzalloc(sizeof(*ev), GFP_KERNEL); if (!ev) { pr_warn("%s: failed to initialize events\n", disk->disk_name); return -ENOMEM; } INIT_LIST_HEAD(&ev->node); ev->disk = disk; spin_lock_init(&ev->lock); mutex_init(&ev->block_mutex); ev->block = 1; ev->poll_msecs = -1; INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn); disk->ev = ev; return 0; } void disk_add_events(struct gendisk *disk) { if (!disk->ev) return; mutex_lock(&disk_events_mutex); list_add_tail(&disk->ev->node, &disk_events); mutex_unlock(&disk_events_mutex); /* * Block count is initialized to 1 and the following initial * unblock kicks it into action. */ __disk_unblock_events(disk, true); } void disk_del_events(struct gendisk *disk) { if (disk->ev) { disk_block_events(disk); mutex_lock(&disk_events_mutex); list_del_init(&disk->ev->node); mutex_unlock(&disk_events_mutex); } } void disk_release_events(struct gendisk *disk) { /* the block count should be 1 from disk_del_events() */ WARN_ON_ONCE(disk->ev && disk->ev->block != 1); kfree(disk->ev); }
1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 // SPDX-License-Identifier: GPL-2.0-or-later /* Sysfs attributes of bond slaves * * Copyright (c) 2014 Scott Feldman <sfeldma@cumulusnetworks.com> */ #include <linux/capability.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <net/bonding.h> struct slave_attribute { struct attribute attr; ssize_t (*show)(struct slave *, char *); }; #define SLAVE_ATTR_RO(_name) \ const struct slave_attribute slave_attr_##_name = __ATTR_RO(_name) static ssize_t state_show(struct slave *slave, char *buf) { switch (bond_slave_state(slave)) { case BOND_STATE_ACTIVE: return sysfs_emit(buf, "active\n"); case BOND_STATE_BACKUP: return sysfs_emit(buf, "backup\n"); default: return sysfs_emit(buf, "UNKNOWN\n"); } } static SLAVE_ATTR_RO(state); static ssize_t mii_status_show(struct slave *slave, char *buf) { return sysfs_emit(buf, "%s\n", bond_slave_link_status(slave->link)); } static SLAVE_ATTR_RO(mii_status); static ssize_t link_failure_count_show(struct slave *slave, char *buf) { return sysfs_emit(buf, "%d\n", slave->link_failure_count); } static SLAVE_ATTR_RO(link_failure_count); static ssize_t perm_hwaddr_show(struct slave *slave, char *buf) { return sysfs_emit(buf, "%*phC\n", slave->dev->addr_len, slave->perm_hwaddr); } static SLAVE_ATTR_RO(perm_hwaddr); static ssize_t queue_id_show(struct slave *slave, char *buf) { return sysfs_emit(buf, "%d\n", READ_ONCE(slave->queue_id)); } static SLAVE_ATTR_RO(queue_id); static ssize_t ad_aggregator_id_show(struct slave *slave, char *buf) { const struct aggregator *agg; if (BOND_MODE(slave->bond) == BOND_MODE_8023AD) { agg = SLAVE_AD_INFO(slave)->port.aggregator; if (agg) return sysfs_emit(buf, "%d\n", agg->aggregator_identifier); } return sysfs_emit(buf, "N/A\n"); } static SLAVE_ATTR_RO(ad_aggregator_id); static ssize_t ad_actor_oper_port_state_show(struct slave *slave, char *buf) { const struct port *ad_port; if (BOND_MODE(slave->bond) == BOND_MODE_8023AD) { ad_port = &SLAVE_AD_INFO(slave)->port; if (ad_port->aggregator) return sysfs_emit(buf, "%u\n", ad_port->actor_oper_port_state); } return sysfs_emit(buf, "N/A\n"); } static SLAVE_ATTR_RO(ad_actor_oper_port_state); static ssize_t ad_partner_oper_port_state_show(struct slave *slave, char *buf) { const struct port *ad_port; if (BOND_MODE(slave->bond) == BOND_MODE_8023AD) { ad_port = &SLAVE_AD_INFO(slave)->port; if (ad_port->aggregator) return sysfs_emit(buf, "%u\n", ad_port->partner_oper.port_state); } return sysfs_emit(buf, "N/A\n"); } static SLAVE_ATTR_RO(ad_partner_oper_port_state); static const struct attribute *slave_attrs[] = { &slave_attr_state.attr, &slave_attr_mii_status.attr, &slave_attr_link_failure_count.attr, &slave_attr_perm_hwaddr.attr, &slave_attr_queue_id.attr, &slave_attr_ad_aggregator_id.attr, &slave_attr_ad_actor_oper_port_state.attr, &slave_attr_ad_partner_oper_port_state.attr, NULL }; #define to_slave_attr(_at) container_of(_at, struct slave_attribute, attr) static ssize_t slave_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct slave_attribute *slave_attr = to_slave_attr(attr); struct slave *slave = to_slave(kobj); return slave_attr->show(slave, buf); } const struct sysfs_ops slave_sysfs_ops = { .show = slave_show, }; int bond_sysfs_slave_add(struct slave *slave) { return sysfs_create_files(&slave->kobj, slave_attrs); } void bond_sysfs_slave_del(struct slave *slave) { sysfs_remove_files(&slave->kobj, slave_attrs); }
3 3 1 2 2 2 2 2 2 1 2 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C)2002 USAGI/WIDE Project * * Authors * * Mitsuru KANDA @USAGI : IPv6 Support * Kazunori MIYAZAWA @USAGI : * Kunihiro Ishiguro <kunihiro@ipinfusion.com> * * This file is derived from net/ipv4/ah.c. */ #define pr_fmt(fmt) "IPv6: " fmt #include <crypto/hash.h> #include <crypto/utils.h> #include <linux/module.h> #include <linux/slab.h> #include <net/ip.h> #include <net/ah.h> #include <linux/crypto.h> #include <linux/pfkeyv2.h> #include <linux/string.h> #include <linux/scatterlist.h> #include <net/ip6_route.h> #include <net/icmp.h> #include <net/ipv6.h> #include <net/protocol.h> #include <net/xfrm.h> #define IPV6HDR_BASELEN 8 struct tmp_ext { #if IS_ENABLED(CONFIG_IPV6_MIP6) struct in6_addr saddr; #endif struct in6_addr daddr; char hdrs[]; }; struct ah_skb_cb { struct xfrm_skb_cb xfrm; void *tmp; }; #define AH_SKB_CB(__skb) ((struct ah_skb_cb *)&((__skb)->cb[0])) static void *ah_alloc_tmp(struct crypto_ahash *ahash, int nfrags, unsigned int size) { unsigned int len; len = size + crypto_ahash_digestsize(ahash); len = ALIGN(len, crypto_tfm_ctx_alignment()); len += sizeof(struct ahash_request) + crypto_ahash_reqsize(ahash); len = ALIGN(len, __alignof__(struct scatterlist)); len += sizeof(struct scatterlist) * nfrags; return kmalloc(len, GFP_ATOMIC); } static inline struct tmp_ext *ah_tmp_ext(void *base) { return base + IPV6HDR_BASELEN; } static inline u8 *ah_tmp_auth(u8 *tmp, unsigned int offset) { return tmp + offset; } static inline u8 *ah_tmp_icv(void *tmp, unsigned int offset) { return tmp + offset; } static inline struct ahash_request *ah_tmp_req(struct crypto_ahash *ahash, u8 *icv) { struct ahash_request *req; req = (void *)PTR_ALIGN(icv + crypto_ahash_digestsize(ahash), crypto_tfm_ctx_alignment()); ahash_request_set_tfm(req, ahash); return req; } static inline struct scatterlist *ah_req_sg(struct crypto_ahash *ahash, struct ahash_request *req) { return (void *)ALIGN((unsigned long)(req + 1) + crypto_ahash_reqsize(ahash), __alignof__(struct scatterlist)); } static bool zero_out_mutable_opts(struct ipv6_opt_hdr *opthdr) { u8 *opt = (u8 *)opthdr; int len = ipv6_optlen(opthdr); int off = 0; int optlen = 0; off += 2; len -= 2; while (len > 0) { switch (opt[off]) { case IPV6_TLV_PAD1: optlen = 1; break; default: if (len < 2) goto bad; optlen = opt[off+1]+2; if (len < optlen) goto bad; if (opt[off] & 0x20) memset(&opt[off+2], 0, opt[off+1]); break; } off += optlen; len -= optlen; } if (len == 0) return true; bad: return false; } #if IS_ENABLED(CONFIG_IPV6_MIP6) /** * ipv6_rearrange_destopt - rearrange IPv6 destination options header * @iph: IPv6 header * @destopt: destionation options header */ static void ipv6_rearrange_destopt(struct ipv6hdr *iph, struct ipv6_opt_hdr *destopt) { u8 *opt = (u8 *)destopt; int len = ipv6_optlen(destopt); int off = 0; int optlen = 0; off += 2; len -= 2; while (len > 0) { switch (opt[off]) { case IPV6_TLV_PAD1: optlen = 1; break; default: if (len < 2) goto bad; optlen = opt[off+1]+2; if (len < optlen) goto bad; /* Rearrange the source address in @iph and the * addresses in home address option for final source. * See 11.3.2 of RFC 3775 for details. */ if (opt[off] == IPV6_TLV_HAO) { struct ipv6_destopt_hao *hao; hao = (struct ipv6_destopt_hao *)&opt[off]; if (hao->length != sizeof(hao->addr)) { net_warn_ratelimited("destopt hao: invalid header length: %u\n", hao->length); goto bad; } swap(hao->addr, iph->saddr); } break; } off += optlen; len -= optlen; } /* Note: ok if len == 0 */ bad: return; } #else static void ipv6_rearrange_destopt(struct ipv6hdr *iph, struct ipv6_opt_hdr *destopt) {} #endif /** * ipv6_rearrange_rthdr - rearrange IPv6 routing header * @iph: IPv6 header * @rthdr: routing header * * Rearrange the destination address in @iph and the addresses in @rthdr * so that they appear in the order they will at the final destination. * See Appendix A2 of RFC 2402 for details. */ static void ipv6_rearrange_rthdr(struct ipv6hdr *iph, struct ipv6_rt_hdr *rthdr) { int segments, segments_left; struct in6_addr *addrs; struct in6_addr final_addr; segments_left = rthdr->segments_left; if (segments_left == 0) return; rthdr->segments_left = 0; /* The value of rthdr->hdrlen has been verified either by the system * call if it is locally generated, or by ipv6_rthdr_rcv() for incoming * packets. So we can assume that it is even and that segments is * greater than or equal to segments_left. * * For the same reason we can assume that this option is of type 0. */ segments = rthdr->hdrlen >> 1; addrs = ((struct rt0_hdr *)rthdr)->addr; final_addr = addrs[segments - 1]; addrs += segments - segments_left; memmove(addrs + 1, addrs, (segments_left - 1) * sizeof(*addrs)); addrs[0] = iph->daddr; iph->daddr = final_addr; } static int ipv6_clear_mutable_options(struct ipv6hdr *iph, int len, int dir) { union { struct ipv6hdr *iph; struct ipv6_opt_hdr *opth; struct ipv6_rt_hdr *rth; char *raw; } exthdr = { .iph = iph }; char *end = exthdr.raw + len; int nexthdr = iph->nexthdr; exthdr.iph++; while (exthdr.raw < end) { switch (nexthdr) { case NEXTHDR_DEST: if (dir == XFRM_POLICY_OUT) ipv6_rearrange_destopt(iph, exthdr.opth); fallthrough; case NEXTHDR_HOP: if (!zero_out_mutable_opts(exthdr.opth)) { net_dbg_ratelimited("overrun %sopts\n", nexthdr == NEXTHDR_HOP ? "hop" : "dest"); return -EINVAL; } break; case NEXTHDR_ROUTING: ipv6_rearrange_rthdr(iph, exthdr.rth); break; default: return 0; } nexthdr = exthdr.opth->nexthdr; exthdr.raw += ipv6_optlen(exthdr.opth); } return 0; } static void ah6_output_done(void *data, int err) { int extlen; u8 *iph_base; u8 *icv; struct sk_buff *skb = data; struct xfrm_state *x = skb_dst(skb)->xfrm; struct ah_data *ahp = x->data; struct ipv6hdr *top_iph = ipv6_hdr(skb); struct ip_auth_hdr *ah = ip_auth_hdr(skb); struct tmp_ext *iph_ext; extlen = skb_network_header_len(skb) - sizeof(struct ipv6hdr); if (extlen) extlen += sizeof(*iph_ext); iph_base = AH_SKB_CB(skb)->tmp; iph_ext = ah_tmp_ext(iph_base); icv = ah_tmp_icv(iph_ext, extlen); memcpy(ah->auth_data, icv, ahp->icv_trunc_len); memcpy(top_iph, iph_base, IPV6HDR_BASELEN); if (extlen) { #if IS_ENABLED(CONFIG_IPV6_MIP6) memcpy(&top_iph->saddr, iph_ext, extlen); #else memcpy(&top_iph->daddr, iph_ext, extlen); #endif } kfree(AH_SKB_CB(skb)->tmp); xfrm_output_resume(skb->sk, skb, err); } static int ah6_output(struct xfrm_state *x, struct sk_buff *skb) { int err; int nfrags; int extlen; u8 *iph_base; u8 *icv; u8 nexthdr; struct sk_buff *trailer; struct crypto_ahash *ahash; struct ahash_request *req; struct scatterlist *sg; struct ipv6hdr *top_iph; struct ip_auth_hdr *ah; struct ah_data *ahp; struct tmp_ext *iph_ext; int seqhi_len = 0; __be32 *seqhi; int sglists = 0; struct scatterlist *seqhisg; ahp = x->data; ahash = ahp->ahash; err = skb_cow_data(skb, 0, &trailer); if (err < 0) goto out; nfrags = err; skb_push(skb, -skb_network_offset(skb)); extlen = skb_network_header_len(skb) - sizeof(struct ipv6hdr); if (extlen) extlen += sizeof(*iph_ext); if (x->props.flags & XFRM_STATE_ESN) { sglists = 1; seqhi_len = sizeof(*seqhi); } err = -ENOMEM; iph_base = ah_alloc_tmp(ahash, nfrags + sglists, IPV6HDR_BASELEN + extlen + seqhi_len); if (!iph_base) goto out; iph_ext = ah_tmp_ext(iph_base); seqhi = (__be32 *)((char *)iph_ext + extlen); icv = ah_tmp_icv(seqhi, seqhi_len); req = ah_tmp_req(ahash, icv); sg = ah_req_sg(ahash, req); seqhisg = sg + nfrags; ah = ip_auth_hdr(skb); memset(ah->auth_data, 0, ahp->icv_trunc_len); top_iph = ipv6_hdr(skb); top_iph->payload_len = htons(skb->len - sizeof(*top_iph)); nexthdr = *skb_mac_header(skb); *skb_mac_header(skb) = IPPROTO_AH; /* When there are no extension headers, we only need to save the first * 8 bytes of the base IP header. */ memcpy(iph_base, top_iph, IPV6HDR_BASELEN); if (extlen) { #if IS_ENABLED(CONFIG_IPV6_MIP6) memcpy(iph_ext, &top_iph->saddr, extlen); #else memcpy(iph_ext, &top_iph->daddr, extlen); #endif err = ipv6_clear_mutable_options(top_iph, extlen - sizeof(*iph_ext) + sizeof(*top_iph), XFRM_POLICY_OUT); if (err) goto out_free; } ah->nexthdr = nexthdr; top_iph->priority = 0; top_iph->flow_lbl[0] = 0; top_iph->flow_lbl[1] = 0; top_iph->flow_lbl[2] = 0; top_iph->hop_limit = 0; ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2; ah->reserved = 0; ah->spi = x->id.spi; ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); sg_init_table(sg, nfrags + sglists); err = skb_to_sgvec_nomark(skb, sg, 0, skb->len); if (unlikely(err < 0)) goto out_free; if (x->props.flags & XFRM_STATE_ESN) { /* Attach seqhi sg right after packet payload */ *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi); sg_set_buf(seqhisg, seqhi, seqhi_len); } ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len); ahash_request_set_callback(req, 0, ah6_output_done, skb); AH_SKB_CB(skb)->tmp = iph_base; err = crypto_ahash_digest(req); if (err) { if (err == -EINPROGRESS) goto out; if (err == -ENOSPC) err = NET_XMIT_DROP; goto out_free; } memcpy(ah->auth_data, icv, ahp->icv_trunc_len); memcpy(top_iph, iph_base, IPV6HDR_BASELEN); if (extlen) { #if IS_ENABLED(CONFIG_IPV6_MIP6) memcpy(&top_iph->saddr, iph_ext, extlen); #else memcpy(&top_iph->daddr, iph_ext, extlen); #endif } out_free: kfree(iph_base); out: return err; } static void ah6_input_done(void *data, int err) { u8 *auth_data; u8 *icv; u8 *work_iph; struct sk_buff *skb = data; struct xfrm_state *x = xfrm_input_state(skb); struct ah_data *ahp = x->data; struct ip_auth_hdr *ah = ip_auth_hdr(skb); int hdr_len = skb_network_header_len(skb); int ah_hlen = ipv6_authlen(ah); if (err) goto out; work_iph = AH_SKB_CB(skb)->tmp; auth_data = ah_tmp_auth(work_iph, hdr_len); icv = ah_tmp_icv(auth_data, ahp->icv_trunc_len); err = crypto_memneq(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0; if (err) goto out; err = ah->nexthdr; skb->network_header += ah_hlen; memcpy(skb_network_header(skb), work_iph, hdr_len); __skb_pull(skb, ah_hlen + hdr_len); if (x->props.mode == XFRM_MODE_TUNNEL) skb_reset_transport_header(skb); else skb_set_transport_header(skb, -hdr_len); out: kfree(AH_SKB_CB(skb)->tmp); xfrm_input_resume(skb, err); } static int ah6_input(struct xfrm_state *x, struct sk_buff *skb) { /* * Before process AH * [IPv6][Ext1][Ext2][AH][Dest][Payload] * |<-------------->| hdr_len * * To erase AH: * Keeping copy of cleared headers. After AH processing, * Moving the pointer of skb->network_header by using skb_pull as long * as AH header length. Then copy back the copy as long as hdr_len * If destination header following AH exists, copy it into after [Ext2]. * * |<>|[IPv6][Ext1][Ext2][Dest][Payload] * There is offset of AH before IPv6 header after the process. */ u8 *auth_data; u8 *icv; u8 *work_iph; struct sk_buff *trailer; struct crypto_ahash *ahash; struct ahash_request *req; struct scatterlist *sg; struct ip_auth_hdr *ah; struct ipv6hdr *ip6h; struct ah_data *ahp; u16 hdr_len; u16 ah_hlen; int nexthdr; int nfrags; int err = -ENOMEM; int seqhi_len = 0; __be32 *seqhi; int sglists = 0; struct scatterlist *seqhisg; if (!pskb_may_pull(skb, sizeof(struct ip_auth_hdr))) goto out; /* We are going to _remove_ AH header to keep sockets happy, * so... Later this can change. */ if (skb_unclone(skb, GFP_ATOMIC)) goto out; skb->ip_summed = CHECKSUM_NONE; hdr_len = skb_network_header_len(skb); ah = (struct ip_auth_hdr *)skb->data; ahp = x->data; ahash = ahp->ahash; nexthdr = ah->nexthdr; ah_hlen = ipv6_authlen(ah); if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) && ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len)) goto out; if (!pskb_may_pull(skb, ah_hlen)) goto out; err = skb_cow_data(skb, 0, &trailer); if (err < 0) goto out; nfrags = err; ah = (struct ip_auth_hdr *)skb->data; ip6h = ipv6_hdr(skb); skb_push(skb, hdr_len); if (x->props.flags & XFRM_STATE_ESN) { sglists = 1; seqhi_len = sizeof(*seqhi); } work_iph = ah_alloc_tmp(ahash, nfrags + sglists, hdr_len + ahp->icv_trunc_len + seqhi_len); if (!work_iph) { err = -ENOMEM; goto out; } auth_data = ah_tmp_auth((u8 *)work_iph, hdr_len); seqhi = (__be32 *)(auth_data + ahp->icv_trunc_len); icv = ah_tmp_icv(seqhi, seqhi_len); req = ah_tmp_req(ahash, icv); sg = ah_req_sg(ahash, req); seqhisg = sg + nfrags; memcpy(work_iph, ip6h, hdr_len); memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); memset(ah->auth_data, 0, ahp->icv_trunc_len); err = ipv6_clear_mutable_options(ip6h, hdr_len, XFRM_POLICY_IN); if (err) goto out_free; ip6h->priority = 0; ip6h->flow_lbl[0] = 0; ip6h->flow_lbl[1] = 0; ip6h->flow_lbl[2] = 0; ip6h->hop_limit = 0; sg_init_table(sg, nfrags + sglists); err = skb_to_sgvec_nomark(skb, sg, 0, skb->len); if (unlikely(err < 0)) goto out_free; if (x->props.flags & XFRM_STATE_ESN) { /* Attach seqhi sg right after packet payload */ *seqhi = XFRM_SKB_CB(skb)->seq.input.hi; sg_set_buf(seqhisg, seqhi, seqhi_len); } ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len); ahash_request_set_callback(req, 0, ah6_input_done, skb); AH_SKB_CB(skb)->tmp = work_iph; err = crypto_ahash_digest(req); if (err) { if (err == -EINPROGRESS) goto out; goto out_free; } err = crypto_memneq(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0; if (err) goto out_free; skb->network_header += ah_hlen; memcpy(skb_network_header(skb), work_iph, hdr_len); __skb_pull(skb, ah_hlen + hdr_len); if (x->props.mode == XFRM_MODE_TUNNEL) skb_reset_transport_header(skb); else skb_set_transport_header(skb, -hdr_len); err = nexthdr; out_free: kfree(work_iph); out: return err; } static int ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct net *net = dev_net(skb->dev); struct ipv6hdr *iph = (struct ipv6hdr *)skb->data; struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+offset); struct xfrm_state *x; if (type != ICMPV6_PKT_TOOBIG && type != NDISC_REDIRECT) return 0; x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET6); if (!x) return 0; if (type == NDISC_REDIRECT) ip6_redirect(skb, net, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); else ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); xfrm_state_put(x); return 0; } static int ah6_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) { struct ah_data *ahp = NULL; struct xfrm_algo_desc *aalg_desc; struct crypto_ahash *ahash; if (!x->aalg) { NL_SET_ERR_MSG(extack, "AH requires a state with an AUTH algorithm"); goto error; } if (x->encap) { NL_SET_ERR_MSG(extack, "AH is not compatible with encapsulation"); goto error; } ahp = kzalloc(sizeof(*ahp), GFP_KERNEL); if (!ahp) return -ENOMEM; ahash = crypto_alloc_ahash(x->aalg->alg_name, 0, 0); if (IS_ERR(ahash)) { NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations"); goto error; } ahp->ahash = ahash; if (crypto_ahash_setkey(ahash, x->aalg->alg_key, (x->aalg->alg_key_len + 7) / 8)) { NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations"); goto error; } /* * Lookup the algorithm description maintained by xfrm_algo, * verify crypto transform properties, and store information * we need for AH processing. This lookup cannot fail here * after a successful crypto_alloc_hash(). */ aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0); BUG_ON(!aalg_desc); if (aalg_desc->uinfo.auth.icv_fullbits/8 != crypto_ahash_digestsize(ahash)) { NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations"); goto error; } ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8; ahp->icv_trunc_len = x->aalg->alg_trunc_len/8; x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len); switch (x->props.mode) { case XFRM_MODE_BEET: case XFRM_MODE_TRANSPORT: break; case XFRM_MODE_TUNNEL: x->props.header_len += sizeof(struct ipv6hdr); break; default: NL_SET_ERR_MSG(extack, "Invalid mode requested for AH, must be one of TRANSPORT, TUNNEL, BEET"); goto error; } x->data = ahp; return 0; error: if (ahp) { crypto_free_ahash(ahp->ahash); kfree(ahp); } return -EINVAL; } static void ah6_destroy(struct xfrm_state *x) { struct ah_data *ahp = x->data; if (!ahp) return; crypto_free_ahash(ahp->ahash); kfree(ahp); } static int ah6_rcv_cb(struct sk_buff *skb, int err) { return 0; } static const struct xfrm_type ah6_type = { .owner = THIS_MODULE, .proto = IPPROTO_AH, .flags = XFRM_TYPE_REPLAY_PROT, .init_state = ah6_init_state, .destructor = ah6_destroy, .input = ah6_input, .output = ah6_output, }; static struct xfrm6_protocol ah6_protocol = { .handler = xfrm6_rcv, .input_handler = xfrm_input, .cb_handler = ah6_rcv_cb, .err_handler = ah6_err, .priority = 0, }; static int __init ah6_init(void) { if (xfrm_register_type(&ah6_type, AF_INET6) < 0) { pr_info("%s: can't add xfrm type\n", __func__); return -EAGAIN; } if (xfrm6_protocol_register(&ah6_protocol, IPPROTO_AH) < 0) { pr_info("%s: can't add protocol\n", __func__); xfrm_unregister_type(&ah6_type, AF_INET6); return -EAGAIN; } return 0; } static void __exit ah6_fini(void) { if (xfrm6_protocol_deregister(&ah6_protocol, IPPROTO_AH) < 0) pr_info("%s: can't remove protocol\n", __func__); xfrm_unregister_type(&ah6_type, AF_INET6); } module_init(ah6_init); module_exit(ah6_fini); MODULE_DESCRIPTION("IPv6 AH transformation helpers"); MODULE_LICENSE("GPL"); MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_AH);
1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 // SPDX-License-Identifier: GPL-2.0-only /* * (C) 2013 Astaro GmbH & Co KG */ #include <linux/module.h> #include <linux/skbuff.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_labels.h> #include <linux/netfilter/x_tables.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); MODULE_DESCRIPTION("Xtables: add/match connection tracking labels"); MODULE_ALIAS("ipt_connlabel"); MODULE_ALIAS("ip6t_connlabel"); static bool connlabel_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_connlabel_mtinfo *info = par->matchinfo; enum ip_conntrack_info ctinfo; struct nf_conn_labels *labels; struct nf_conn *ct; bool invert = info->options & XT_CONNLABEL_OP_INVERT; ct = nf_ct_get(skb, &ctinfo); if (ct == NULL) return invert; labels = nf_ct_labels_find(ct); if (!labels) return invert; if (test_bit(info->bit, labels->bits)) return !invert; if (info->options & XT_CONNLABEL_OP_SET) { if (!test_and_set_bit(info->bit, labels->bits)) nf_conntrack_event_cache(IPCT_LABEL, ct); return !invert; } return invert; } static int connlabel_mt_check(const struct xt_mtchk_param *par) { const int options = XT_CONNLABEL_OP_INVERT | XT_CONNLABEL_OP_SET; struct xt_connlabel_mtinfo *info = par->matchinfo; int ret; if (info->options & ~options) { pr_info_ratelimited("Unknown options in mask %x\n", info->options); return -EINVAL; } ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) { pr_info_ratelimited("cannot load conntrack support for proto=%u\n", par->family); return ret; } ret = nf_connlabels_get(par->net, info->bit); if (ret < 0) nf_ct_netns_put(par->net, par->family); return ret; } static void connlabel_mt_destroy(const struct xt_mtdtor_param *par) { nf_connlabels_put(par->net); nf_ct_netns_put(par->net, par->family); } static struct xt_match connlabels_mt_reg __read_mostly = { .name = "connlabel", .family = NFPROTO_UNSPEC, .checkentry = connlabel_mt_check, .match = connlabel_mt, .matchsize = sizeof(struct xt_connlabel_mtinfo), .destroy = connlabel_mt_destroy, .me = THIS_MODULE, }; static int __init connlabel_mt_init(void) { return xt_register_match(&connlabels_mt_reg); } static void __exit connlabel_mt_exit(void) { xt_unregister_match(&connlabels_mt_reg); } module_init(connlabel_mt_init); module_exit(connlabel_mt_exit);
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ #include <linux/mm.h> #include <linux/llist.h> #include <linux/bpf.h> #include <linux/irq_work.h> #include <linux/bpf_mem_alloc.h> #include <linux/memcontrol.h> #include <asm/local.h> /* Any context (including NMI) BPF specific memory allocator. * * Tracing BPF programs can attach to kprobe and fentry. Hence they * run in unknown context where calling plain kmalloc() might not be safe. * * Front-end kmalloc() with per-cpu per-bucket cache of free elements. * Refill this cache asynchronously from irq_work. * * CPU_0 buckets * 16 32 64 96 128 196 256 512 1024 2048 4096 * ... * CPU_N buckets * 16 32 64 96 128 196 256 512 1024 2048 4096 * * The buckets are prefilled at the start. * BPF programs always run with migration disabled. * It's safe to allocate from cache of the current cpu with irqs disabled. * Free-ing is always done into bucket of the current cpu as well. * irq_work trims extra free elements from buckets with kfree * and refills them with kmalloc, so global kmalloc logic takes care * of freeing objects allocated by one cpu and freed on another. * * Every allocated objected is padded with extra 8 bytes that contains * struct llist_node. */ #define LLIST_NODE_SZ sizeof(struct llist_node) /* similar to kmalloc, but sizeof == 8 bucket is gone */ static u8 size_index[24] __ro_after_init = { 3, /* 8 */ 3, /* 16 */ 4, /* 24 */ 4, /* 32 */ 5, /* 40 */ 5, /* 48 */ 5, /* 56 */ 5, /* 64 */ 1, /* 72 */ 1, /* 80 */ 1, /* 88 */ 1, /* 96 */ 6, /* 104 */ 6, /* 112 */ 6, /* 120 */ 6, /* 128 */ 2, /* 136 */ 2, /* 144 */ 2, /* 152 */ 2, /* 160 */ 2, /* 168 */ 2, /* 176 */ 2, /* 184 */ 2 /* 192 */ }; static int bpf_mem_cache_idx(size_t size) { if (!size || size > 4096) return -1; if (size <= 192) return size_index[(size - 1) / 8] - 1; return fls(size - 1) - 2; } #define NUM_CACHES 11 struct bpf_mem_cache { /* per-cpu list of free objects of size 'unit_size'. * All accesses are done with interrupts disabled and 'active' counter * protection with __llist_add() and __llist_del_first(). */ struct llist_head free_llist; local_t active; /* Operations on the free_list from unit_alloc/unit_free/bpf_mem_refill * are sequenced by per-cpu 'active' counter. But unit_free() cannot * fail. When 'active' is busy the unit_free() will add an object to * free_llist_extra. */ struct llist_head free_llist_extra; struct irq_work refill_work; struct obj_cgroup *objcg; int unit_size; /* count of objects in free_llist */ int free_cnt; int low_watermark, high_watermark, batch; int percpu_size; bool draining; struct bpf_mem_cache *tgt; /* list of objects to be freed after RCU GP */ struct llist_head free_by_rcu; struct llist_node *free_by_rcu_tail; struct llist_head waiting_for_gp; struct llist_node *waiting_for_gp_tail; struct rcu_head rcu; atomic_t call_rcu_in_progress; struct llist_head free_llist_extra_rcu; /* list of objects to be freed after RCU tasks trace GP */ struct llist_head free_by_rcu_ttrace; struct llist_head waiting_for_gp_ttrace; struct rcu_head rcu_ttrace; atomic_t call_rcu_ttrace_in_progress; }; struct bpf_mem_caches { struct bpf_mem_cache cache[NUM_CACHES]; }; static const u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096}; static struct llist_node notrace *__llist_del_first(struct llist_head *head) { struct llist_node *entry, *next; entry = head->first; if (!entry) return NULL; next = entry->next; head->first = next; return entry; } static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags) { if (c->percpu_size) { void **obj = kmalloc_node(c->percpu_size, flags, node); void *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags); if (!obj || !pptr) { free_percpu(pptr); kfree(obj); return NULL; } obj[1] = pptr; return obj; } return kmalloc_node(c->unit_size, flags | __GFP_ZERO, node); } static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c) { #ifdef CONFIG_MEMCG_KMEM if (c->objcg) return get_mem_cgroup_from_objcg(c->objcg); #endif #ifdef CONFIG_MEMCG return root_mem_cgroup; #else return NULL; #endif } static void inc_active(struct bpf_mem_cache *c, unsigned long *flags) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) /* In RT irq_work runs in per-cpu kthread, so disable * interrupts to avoid preemption and interrupts and * reduce the chance of bpf prog executing on this cpu * when active counter is busy. */ local_irq_save(*flags); /* alloc_bulk runs from irq_work which will not preempt a bpf * program that does unit_alloc/unit_free since IRQs are * disabled there. There is no race to increment 'active' * counter. It protects free_llist from corruption in case NMI * bpf prog preempted this loop. */ WARN_ON_ONCE(local_inc_return(&c->active) != 1); } static void dec_active(struct bpf_mem_cache *c, unsigned long *flags) { local_dec(&c->active); if (IS_ENABLED(CONFIG_PREEMPT_RT)) local_irq_restore(*flags); } static void add_obj_to_free_list(struct bpf_mem_cache *c, void *obj) { unsigned long flags; inc_active(c, &flags); __llist_add(obj, &c->free_llist); c->free_cnt++; dec_active(c, &flags); } /* Mostly runs from irq_work except __init phase. */ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node, bool atomic) { struct mem_cgroup *memcg = NULL, *old_memcg; gfp_t gfp; void *obj; int i; gfp = __GFP_NOWARN | __GFP_ACCOUNT; gfp |= atomic ? GFP_NOWAIT : GFP_KERNEL; for (i = 0; i < cnt; i++) { /* * For every 'c' llist_del_first(&c->free_by_rcu_ttrace); is * done only by one CPU == current CPU. Other CPUs might * llist_add() and llist_del_all() in parallel. */ obj = llist_del_first(&c->free_by_rcu_ttrace); if (!obj) break; add_obj_to_free_list(c, obj); } if (i >= cnt) return; for (; i < cnt; i++) { obj = llist_del_first(&c->waiting_for_gp_ttrace); if (!obj) break; add_obj_to_free_list(c, obj); } if (i >= cnt) return; memcg = get_memcg(c); old_memcg = set_active_memcg(memcg); for (; i < cnt; i++) { /* Allocate, but don't deplete atomic reserves that typical * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc * will allocate from the current numa node which is what we * want here. */ obj = __alloc(c, node, gfp); if (!obj) break; add_obj_to_free_list(c, obj); } set_active_memcg(old_memcg); mem_cgroup_put(memcg); } static void free_one(void *obj, bool percpu) { if (percpu) { free_percpu(((void **)obj)[1]); kfree(obj); return; } kfree(obj); } static int free_all(struct llist_node *llnode, bool percpu) { struct llist_node *pos, *t; int cnt = 0; llist_for_each_safe(pos, t, llnode) { free_one(pos, percpu); cnt++; } return cnt; } static void __free_rcu(struct rcu_head *head) { struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu_ttrace); free_all(llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size); atomic_set(&c->call_rcu_ttrace_in_progress, 0); } static void __free_rcu_tasks_trace(struct rcu_head *head) { /* If RCU Tasks Trace grace period implies RCU grace period, * there is no need to invoke call_rcu(). */ if (rcu_trace_implies_rcu_gp()) __free_rcu(head); else call_rcu(head, __free_rcu); } static void enque_to_free(struct bpf_mem_cache *c, void *obj) { struct llist_node *llnode = obj; /* bpf_mem_cache is a per-cpu object. Freeing happens in irq_work. * Nothing races to add to free_by_rcu_ttrace list. */ llist_add(llnode, &c->free_by_rcu_ttrace); } static void do_call_rcu_ttrace(struct bpf_mem_cache *c) { struct llist_node *llnode, *t; if (atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)) { if (unlikely(READ_ONCE(c->draining))) { llnode = llist_del_all(&c->free_by_rcu_ttrace); free_all(llnode, !!c->percpu_size); } return; } WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace)); llist_for_each_safe(llnode, t, llist_del_all(&c->free_by_rcu_ttrace)) llist_add(llnode, &c->waiting_for_gp_ttrace); if (unlikely(READ_ONCE(c->draining))) { __free_rcu(&c->rcu_ttrace); return; } /* Use call_rcu_tasks_trace() to wait for sleepable progs to finish. * If RCU Tasks Trace grace period implies RCU grace period, free * these elements directly, else use call_rcu() to wait for normal * progs to finish and finally do free_one() on each element. */ call_rcu_tasks_trace(&c->rcu_ttrace, __free_rcu_tasks_trace); } static void free_bulk(struct bpf_mem_cache *c) { struct bpf_mem_cache *tgt = c->tgt; struct llist_node *llnode, *t; unsigned long flags; int cnt; WARN_ON_ONCE(tgt->unit_size != c->unit_size); WARN_ON_ONCE(tgt->percpu_size != c->percpu_size); do { inc_active(c, &flags); llnode = __llist_del_first(&c->free_llist); if (llnode) cnt = --c->free_cnt; else cnt = 0; dec_active(c, &flags); if (llnode) enque_to_free(tgt, llnode); } while (cnt > (c->high_watermark + c->low_watermark) / 2); /* and drain free_llist_extra */ llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra)) enque_to_free(tgt, llnode); do_call_rcu_ttrace(tgt); } static void __free_by_rcu(struct rcu_head *head) { struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu); struct bpf_mem_cache *tgt = c->tgt; struct llist_node *llnode; WARN_ON_ONCE(tgt->unit_size != c->unit_size); WARN_ON_ONCE(tgt->percpu_size != c->percpu_size); llnode = llist_del_all(&c->waiting_for_gp); if (!llnode) goto out; llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace); /* Objects went through regular RCU GP. Send them to RCU tasks trace */ do_call_rcu_ttrace(tgt); out: atomic_set(&c->call_rcu_in_progress, 0); } static void check_free_by_rcu(struct bpf_mem_cache *c) { struct llist_node *llnode, *t; unsigned long flags; /* drain free_llist_extra_rcu */ if (unlikely(!llist_empty(&c->free_llist_extra_rcu))) { inc_active(c, &flags); llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra_rcu)) if (__llist_add(llnode, &c->free_by_rcu)) c->free_by_rcu_tail = llnode; dec_active(c, &flags); } if (llist_empty(&c->free_by_rcu)) return; if (atomic_xchg(&c->call_rcu_in_progress, 1)) { /* * Instead of kmalloc-ing new rcu_head and triggering 10k * call_rcu() to hit rcutree.qhimark and force RCU to notice * the overload just ask RCU to hurry up. There could be many * objects in free_by_rcu list. * This hint reduces memory consumption for an artificial * benchmark from 2 Gbyte to 150 Mbyte. */ rcu_request_urgent_qs_task(current); return; } WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp)); inc_active(c, &flags); WRITE_ONCE(c->waiting_for_gp.first, __llist_del_all(&c->free_by_rcu)); c->waiting_for_gp_tail = c->free_by_rcu_tail; dec_active(c, &flags); if (unlikely(READ_ONCE(c->draining))) { free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size); atomic_set(&c->call_rcu_in_progress, 0); } else { call_rcu_hurry(&c->rcu, __free_by_rcu); } } static void bpf_mem_refill(struct irq_work *work) { struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work); int cnt; /* Racy access to free_cnt. It doesn't need to be 100% accurate */ cnt = c->free_cnt; if (cnt < c->low_watermark) /* irq_work runs on this cpu and kmalloc will allocate * from the current numa node which is what we want here. */ alloc_bulk(c, c->batch, NUMA_NO_NODE, true); else if (cnt > c->high_watermark) free_bulk(c); check_free_by_rcu(c); } static void notrace irq_work_raise(struct bpf_mem_cache *c) { irq_work_queue(&c->refill_work); } /* For typical bpf map case that uses bpf_mem_cache_alloc and single bucket * the freelist cache will be elem_size * 64 (or less) on each cpu. * * For bpf programs that don't have statically known allocation sizes and * assuming (low_mark + high_mark) / 2 as an average number of elements per * bucket and all buckets are used the total amount of memory in freelists * on each cpu will be: * 64*16 + 64*32 + 64*64 + 64*96 + 64*128 + 64*196 + 64*256 + 32*512 + 16*1024 + 8*2048 + 4*4096 * == ~ 116 Kbyte using below heuristic. * Initialized, but unused bpf allocator (not bpf map specific one) will * consume ~ 11 Kbyte per cpu. * Typical case will be between 11K and 116K closer to 11K. * bpf progs can and should share bpf_mem_cache when possible. * * Percpu allocation is typically rare. To avoid potential unnecessary large * memory consumption, set low_mark = 1 and high_mark = 3, resulting in c->batch = 1. */ static void init_refill_work(struct bpf_mem_cache *c) { init_irq_work(&c->refill_work, bpf_mem_refill); if (c->percpu_size) { c->low_watermark = 1; c->high_watermark = 3; } else if (c->unit_size <= 256) { c->low_watermark = 32; c->high_watermark = 96; } else { /* When page_size == 4k, order-0 cache will have low_mark == 2 * and high_mark == 6 with batch alloc of 3 individual pages at * a time. * 8k allocs and above low == 1, high == 3, batch == 1. */ c->low_watermark = max(32 * 256 / c->unit_size, 1); c->high_watermark = max(96 * 256 / c->unit_size, 3); } c->batch = max((c->high_watermark - c->low_watermark) / 4 * 3, 1); } static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) { int cnt = 1; /* To avoid consuming memory, for non-percpu allocation, assume that * 1st run of bpf prog won't be doing more than 4 map_update_elem from * irq disabled region if unit size is less than or equal to 256. * For all other cases, let us just do one allocation. */ if (!c->percpu_size && c->unit_size <= 256) cnt = 4; alloc_bulk(c, cnt, cpu_to_node(cpu), false); } /* When size != 0 bpf_mem_cache for each cpu. * This is typical bpf hash map use case when all elements have equal size. * * When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on * kmalloc/kfree. Max allocation size is 4096 in this case. * This is bpf_dynptr and bpf_kptr use case. */ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) { struct bpf_mem_caches *cc, __percpu *pcc; struct bpf_mem_cache *c, __percpu *pc; struct obj_cgroup *objcg = NULL; int cpu, i, unit_size, percpu_size = 0; if (percpu && size == 0) return -EINVAL; /* room for llist_node and per-cpu pointer */ if (percpu) percpu_size = LLIST_NODE_SZ + sizeof(void *); ma->percpu = percpu; if (size) { pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL); if (!pc) return -ENOMEM; if (!percpu) size += LLIST_NODE_SZ; /* room for llist_node */ unit_size = size; #ifdef CONFIG_MEMCG_KMEM if (memcg_bpf_enabled()) objcg = get_obj_cgroup_from_current(); #endif ma->objcg = objcg; for_each_possible_cpu(cpu) { c = per_cpu_ptr(pc, cpu); c->unit_size = unit_size; c->objcg = objcg; c->percpu_size = percpu_size; c->tgt = c; init_refill_work(c); prefill_mem_cache(c, cpu); } ma->cache = pc; return 0; } pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL); if (!pcc) return -ENOMEM; #ifdef CONFIG_MEMCG_KMEM objcg = get_obj_cgroup_from_current(); #endif ma->objcg = objcg; for_each_possible_cpu(cpu) { cc = per_cpu_ptr(pcc, cpu); for (i = 0; i < NUM_CACHES; i++) { c = &cc->cache[i]; c->unit_size = sizes[i]; c->objcg = objcg; c->percpu_size = percpu_size; c->tgt = c; init_refill_work(c); prefill_mem_cache(c, cpu); } } ma->caches = pcc; return 0; } int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg) { struct bpf_mem_caches __percpu *pcc; pcc = __alloc_percpu_gfp(sizeof(struct bpf_mem_caches), 8, GFP_KERNEL); if (!pcc) return -ENOMEM; ma->caches = pcc; ma->objcg = objcg; ma->percpu = true; return 0; } int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size) { struct bpf_mem_caches *cc, __percpu *pcc; int cpu, i, unit_size, percpu_size; struct obj_cgroup *objcg; struct bpf_mem_cache *c; i = bpf_mem_cache_idx(size); if (i < 0) return -EINVAL; /* room for llist_node and per-cpu pointer */ percpu_size = LLIST_NODE_SZ + sizeof(void *); unit_size = sizes[i]; objcg = ma->objcg; pcc = ma->caches; for_each_possible_cpu(cpu) { cc = per_cpu_ptr(pcc, cpu); c = &cc->cache[i]; if (c->unit_size) break; c->unit_size = unit_size; c->objcg = objcg; c->percpu_size = percpu_size; c->tgt = c; init_refill_work(c); prefill_mem_cache(c, cpu); } return 0; } static void drain_mem_cache(struct bpf_mem_cache *c) { bool percpu = !!c->percpu_size; /* No progs are using this bpf_mem_cache, but htab_map_free() called * bpf_mem_cache_free() for all remaining elements and they can be in * free_by_rcu_ttrace or in waiting_for_gp_ttrace lists, so drain those lists now. * * Except for waiting_for_gp_ttrace list, there are no concurrent operations * on these lists, so it is safe to use __llist_del_all(). */ free_all(llist_del_all(&c->free_by_rcu_ttrace), percpu); free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu); free_all(__llist_del_all(&c->free_llist), percpu); free_all(__llist_del_all(&c->free_llist_extra), percpu); free_all(__llist_del_all(&c->free_by_rcu), percpu); free_all(__llist_del_all(&c->free_llist_extra_rcu), percpu); free_all(llist_del_all(&c->waiting_for_gp), percpu); } static void check_mem_cache(struct bpf_mem_cache *c) { WARN_ON_ONCE(!llist_empty(&c->free_by_rcu_ttrace)); WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace)); WARN_ON_ONCE(!llist_empty(&c->free_llist)); WARN_ON_ONCE(!llist_empty(&c->free_llist_extra)); WARN_ON_ONCE(!llist_empty(&c->free_by_rcu)); WARN_ON_ONCE(!llist_empty(&c->free_llist_extra_rcu)); WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp)); } static void check_leaked_objs(struct bpf_mem_alloc *ma) { struct bpf_mem_caches *cc; struct bpf_mem_cache *c; int cpu, i; if (ma->cache) { for_each_possible_cpu(cpu) { c = per_cpu_ptr(ma->cache, cpu); check_mem_cache(c); } } if (ma->caches) { for_each_possible_cpu(cpu) { cc = per_cpu_ptr(ma->caches, cpu); for (i = 0; i < NUM_CACHES; i++) { c = &cc->cache[i]; check_mem_cache(c); } } } } static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma) { check_leaked_objs(ma); free_percpu(ma->cache); free_percpu(ma->caches); ma->cache = NULL; ma->caches = NULL; } static void free_mem_alloc(struct bpf_mem_alloc *ma) { /* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks * might still execute. Wait for them. * * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(), * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used * to wait for the pending __free_rcu_tasks_trace() and __free_rcu(), * so if call_rcu(head, __free_rcu) is skipped due to * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by * using rcu_trace_implies_rcu_gp() as well. */ rcu_barrier(); /* wait for __free_by_rcu */ rcu_barrier_tasks_trace(); /* wait for __free_rcu */ if (!rcu_trace_implies_rcu_gp()) rcu_barrier(); free_mem_alloc_no_barrier(ma); } static void free_mem_alloc_deferred(struct work_struct *work) { struct bpf_mem_alloc *ma = container_of(work, struct bpf_mem_alloc, work); free_mem_alloc(ma); kfree(ma); } static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress) { struct bpf_mem_alloc *copy; if (!rcu_in_progress) { /* Fast path. No callbacks are pending, hence no need to do * rcu_barrier-s. */ free_mem_alloc_no_barrier(ma); return; } copy = kmemdup(ma, sizeof(*ma), GFP_KERNEL); if (!copy) { /* Slow path with inline barrier-s */ free_mem_alloc(ma); return; } /* Defer barriers into worker to let the rest of map memory to be freed */ memset(ma, 0, sizeof(*ma)); INIT_WORK(&copy->work, free_mem_alloc_deferred); queue_work(system_unbound_wq, &copy->work); } void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) { struct bpf_mem_caches *cc; struct bpf_mem_cache *c; int cpu, i, rcu_in_progress; if (ma->cache) { rcu_in_progress = 0; for_each_possible_cpu(cpu) { c = per_cpu_ptr(ma->cache, cpu); WRITE_ONCE(c->draining, true); irq_work_sync(&c->refill_work); drain_mem_cache(c); rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress); rcu_in_progress += atomic_read(&c->call_rcu_in_progress); } obj_cgroup_put(ma->objcg); destroy_mem_alloc(ma, rcu_in_progress); } if (ma->caches) { rcu_in_progress = 0; for_each_possible_cpu(cpu) { cc = per_cpu_ptr(ma->caches, cpu); for (i = 0; i < NUM_CACHES; i++) { c = &cc->cache[i]; WRITE_ONCE(c->draining, true); irq_work_sync(&c->refill_work); drain_mem_cache(c); rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress); rcu_in_progress += atomic_read(&c->call_rcu_in_progress); } } obj_cgroup_put(ma->objcg); destroy_mem_alloc(ma, rcu_in_progress); } } /* notrace is necessary here and in other functions to make sure * bpf programs cannot attach to them and cause llist corruptions. */ static void notrace *unit_alloc(struct bpf_mem_cache *c) { struct llist_node *llnode = NULL; unsigned long flags; int cnt = 0; /* Disable irqs to prevent the following race for majority of prog types: * prog_A * bpf_mem_alloc * preemption or irq -> prog_B * bpf_mem_alloc * * but prog_B could be a perf_event NMI prog. * Use per-cpu 'active' counter to order free_list access between * unit_alloc/unit_free/bpf_mem_refill. */ local_irq_save(flags); if (local_inc_return(&c->active) == 1) { llnode = __llist_del_first(&c->free_llist); if (llnode) { cnt = --c->free_cnt; *(struct bpf_mem_cache **)llnode = c; } } local_dec(&c->active); WARN_ON(cnt < 0); if (cnt < c->low_watermark) irq_work_raise(c); /* Enable IRQ after the enqueue of irq work completes, so irq work * will run after IRQ is enabled and free_llist may be refilled by * irq work before other task preempts current task. */ local_irq_restore(flags); return llnode; } /* Though 'ptr' object could have been allocated on a different cpu * add it to the free_llist of the current cpu. * Let kfree() logic deal with it when it's later called from irq_work. */ static void notrace unit_free(struct bpf_mem_cache *c, void *ptr) { struct llist_node *llnode = ptr - LLIST_NODE_SZ; unsigned long flags; int cnt = 0; BUILD_BUG_ON(LLIST_NODE_SZ > 8); /* * Remember bpf_mem_cache that allocated this object. * The hint is not accurate. */ c->tgt = *(struct bpf_mem_cache **)llnode; local_irq_save(flags); if (local_inc_return(&c->active) == 1) { __llist_add(llnode, &c->free_llist); cnt = ++c->free_cnt; } else { /* unit_free() cannot fail. Therefore add an object to atomic * llist. free_bulk() will drain it. Though free_llist_extra is * a per-cpu list we have to use atomic llist_add here, since * it also can be interrupted by bpf nmi prog that does another * unit_free() into the same free_llist_extra. */ llist_add(llnode, &c->free_llist_extra); } local_dec(&c->active); if (cnt > c->high_watermark) /* free few objects from current cpu into global kmalloc pool */ irq_work_raise(c); /* Enable IRQ after irq_work_raise() completes, otherwise when current * task is preempted by task which does unit_alloc(), unit_alloc() may * return NULL unexpectedly because irq work is already pending but can * not been triggered and free_llist can not be refilled timely. */ local_irq_restore(flags); } static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr) { struct llist_node *llnode = ptr - LLIST_NODE_SZ; unsigned long flags; c->tgt = *(struct bpf_mem_cache **)llnode; local_irq_save(flags); if (local_inc_return(&c->active) == 1) { if (__llist_add(llnode, &c->free_by_rcu)) c->free_by_rcu_tail = llnode; } else { llist_add(llnode, &c->free_llist_extra_rcu); } local_dec(&c->active); if (!atomic_read(&c->call_rcu_in_progress)) irq_work_raise(c); local_irq_restore(flags); } /* Called from BPF program or from sys_bpf syscall. * In both cases migration is disabled. */ void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size) { int idx; void *ret; if (!size) return NULL; if (!ma->percpu) size += LLIST_NODE_SZ; idx = bpf_mem_cache_idx(size); if (idx < 0) return NULL; ret = unit_alloc(this_cpu_ptr(ma->caches)->cache + idx); return !ret ? NULL : ret + LLIST_NODE_SZ; } void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr) { struct bpf_mem_cache *c; int idx; if (!ptr) return; c = *(void **)(ptr - LLIST_NODE_SZ); idx = bpf_mem_cache_idx(c->unit_size); if (WARN_ON_ONCE(idx < 0)) return; unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr); } void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr) { struct bpf_mem_cache *c; int idx; if (!ptr) return; c = *(void **)(ptr - LLIST_NODE_SZ); idx = bpf_mem_cache_idx(c->unit_size); if (WARN_ON_ONCE(idx < 0)) return; unit_free_rcu(this_cpu_ptr(ma->caches)->cache + idx, ptr); } void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma) { void *ret; ret = unit_alloc(this_cpu_ptr(ma->cache)); return !ret ? NULL : ret + LLIST_NODE_SZ; } void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr) { if (!ptr) return; unit_free(this_cpu_ptr(ma->cache), ptr); } void notrace bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr) { if (!ptr) return; unit_free_rcu(this_cpu_ptr(ma->cache), ptr); } /* Directly does a kfree() without putting 'ptr' back to the free_llist * for reuse and without waiting for a rcu_tasks_trace gp. * The caller must first go through the rcu_tasks_trace gp for 'ptr' * before calling bpf_mem_cache_raw_free(). * It could be used when the rcu_tasks_trace callback does not have * a hold on the original bpf_mem_alloc object that allocated the * 'ptr'. This should only be used in the uncommon code path. * Otherwise, the bpf_mem_alloc's free_llist cannot be refilled * and may affect performance. */ void bpf_mem_cache_raw_free(void *ptr) { if (!ptr) return; kfree(ptr - LLIST_NODE_SZ); } /* When flags == GFP_KERNEL, it signals that the caller will not cause * deadlock when using kmalloc. bpf_mem_cache_alloc_flags() will use * kmalloc if the free_llist is empty. */ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags) { struct bpf_mem_cache *c; void *ret; c = this_cpu_ptr(ma->cache); ret = unit_alloc(c); if (!ret && flags == GFP_KERNEL) { struct mem_cgroup *memcg, *old_memcg; memcg = get_memcg(c); old_memcg = set_active_memcg(memcg); ret = __alloc(c, NUMA_NO_NODE, GFP_KERNEL | __GFP_NOWARN | __GFP_ACCOUNT); if (ret) *(struct bpf_mem_cache **)ret = c; set_active_memcg(old_memcg); mem_cgroup_put(memcg); } return !ret ? NULL : ret + LLIST_NODE_SZ; }
72 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_MROUTE_H #define __LINUX_MROUTE_H #include <linux/in.h> #include <linux/pim.h> #include <net/fib_rules.h> #include <net/fib_notifier.h> #include <uapi/linux/mroute.h> #include <linux/mroute_base.h> #include <linux/sockptr.h> #ifdef CONFIG_IP_MROUTE static inline int ip_mroute_opt(int opt) { return opt >= MRT_BASE && opt <= MRT_MAX; } int ip_mroute_setsockopt(struct sock *, int, sockptr_t, unsigned int); int ip_mroute_getsockopt(struct sock *, int, sockptr_t, sockptr_t); int ipmr_ioctl(struct sock *sk, int cmd, void *arg); int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); int ip_mr_init(void); bool ipmr_rule_default(const struct fib_rule *rule); int ipmr_sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); #else static inline int ip_mroute_setsockopt(struct sock *sock, int optname, sockptr_t optval, unsigned int optlen) { return -ENOPROTOOPT; } static inline int ip_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval, sockptr_t optlen) { return -ENOPROTOOPT; } static inline int ipmr_ioctl(struct sock *sk, int cmd, void *arg) { return -ENOIOCTLCMD; } static inline int ip_mr_init(void) { return 0; } static inline int ip_mroute_opt(int opt) { return 0; } static inline bool ipmr_rule_default(const struct fib_rule *rule) { return true; } static inline int ipmr_sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) { return 1; } #endif #define VIFF_STATIC 0x8000 struct mfc_cache_cmp_arg { __be32 mfc_mcastgrp; __be32 mfc_origin; }; /** * struct mfc_cache - multicast routing entries * @_c: Common multicast routing information; has to be first [for casting] * @mfc_mcastgrp: destination multicast group address * @mfc_origin: source address * @cmparg: used for rhashtable comparisons */ struct mfc_cache { struct mr_mfc _c; union { struct { __be32 mfc_mcastgrp; __be32 mfc_origin; }; struct mfc_cache_cmp_arg cmparg; }; }; struct rtmsg; int ipmr_get_route(struct net *net, struct sk_buff *skb, __be32 saddr, __be32 daddr, struct rtmsg *rtm, u32 portid); #endif
7 8 7 7 5 5 7 7 7 7 7 7 6 5 5 5 5 6 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 // SPDX-License-Identifier: GPL-2.0 /* * file.c - part of debugfs, a tiny little debug file system * * Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com> * Copyright (C) 2004 IBM Inc. * * debugfs is for people to use instead of /proc or /sys. * See Documentation/filesystems/ for more details. */ #include <linux/module.h> #include <linux/fs.h> #include <linux/seq_file.h> #include <linux/pagemap.h> #include <linux/debugfs.h> #include <linux/io.h> #include <linux/slab.h> #include <linux/atomic.h> #include <linux/device.h> #include <linux/pm_runtime.h> #include <linux/poll.h> #include <linux/security.h> #include "internal.h" struct poll_table_struct; static ssize_t default_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) { return 0; } static ssize_t default_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { return count; } const struct file_operations debugfs_noop_file_operations = { .read = default_read_file, .write = default_write_file, .open = simple_open, .llseek = noop_llseek, }; #define F_DENTRY(filp) ((filp)->f_path.dentry) const struct file_operations *debugfs_real_fops(const struct file *filp) { struct debugfs_fsdata *fsd = F_DENTRY(filp)->d_fsdata; if ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT) { /* * Urgh, we've been called w/o a protecting * debugfs_file_get(). */ WARN_ON(1); return NULL; } return fsd->real_fops; } EXPORT_SYMBOL_GPL(debugfs_real_fops); /** * debugfs_file_get - mark the beginning of file data access * @dentry: the dentry object whose data is being accessed. * * Up to a matching call to debugfs_file_put(), any successive call * into the file removing functions debugfs_remove() and * debugfs_remove_recursive() will block. Since associated private * file data may only get freed after a successful return of any of * the removal functions, you may safely access it after a successful * call to debugfs_file_get() without worrying about lifetime issues. * * If -%EIO is returned, the file has already been removed and thus, * it is not safe to access any of its data. If, on the other hand, * it is allowed to access the file data, zero is returned. */ int debugfs_file_get(struct dentry *dentry) { struct debugfs_fsdata *fsd; void *d_fsd; /* * This could only happen if some debugfs user erroneously calls * debugfs_file_get() on a dentry that isn't even a file, let * them know about it. */ if (WARN_ON(!d_is_reg(dentry))) return -EINVAL; d_fsd = READ_ONCE(dentry->d_fsdata); if (!((unsigned long)d_fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)) { fsd = d_fsd; } else { fsd = kmalloc(sizeof(*fsd), GFP_KERNEL); if (!fsd) return -ENOMEM; fsd->real_fops = (void *)((unsigned long)d_fsd & ~DEBUGFS_FSDATA_IS_REAL_FOPS_BIT); refcount_set(&fsd->active_users, 1); init_completion(&fsd->active_users_drained); INIT_LIST_HEAD(&fsd->cancellations); mutex_init(&fsd->cancellations_mtx); if (cmpxchg(&dentry->d_fsdata, d_fsd, fsd) != d_fsd) { mutex_destroy(&fsd->cancellations_mtx); kfree(fsd); fsd = READ_ONCE(dentry->d_fsdata); } } /* * In case of a successful cmpxchg() above, this check is * strictly necessary and must follow it, see the comment in * __debugfs_remove_file(). * OTOH, if the cmpxchg() hasn't been executed or wasn't * successful, this serves the purpose of not starving * removers. */ if (d_unlinked(dentry)) return -EIO; if (!refcount_inc_not_zero(&fsd->active_users)) return -EIO; return 0; } EXPORT_SYMBOL_GPL(debugfs_file_get); /** * debugfs_file_put - mark the end of file data access * @dentry: the dentry object formerly passed to * debugfs_file_get(). * * Allow any ongoing concurrent call into debugfs_remove() or * debugfs_remove_recursive() blocked by a former call to * debugfs_file_get() to proceed and return to its caller. */ void debugfs_file_put(struct dentry *dentry) { struct debugfs_fsdata *fsd = READ_ONCE(dentry->d_fsdata); if (refcount_dec_and_test(&fsd->active_users)) complete(&fsd->active_users_drained); } EXPORT_SYMBOL_GPL(debugfs_file_put); /** * debugfs_enter_cancellation - enter a debugfs cancellation * @file: the file being accessed * @cancellation: the cancellation object, the cancel callback * inside of it must be initialized * * When a debugfs file is removed it needs to wait for all active * operations to complete. However, the operation itself may need * to wait for hardware or completion of some asynchronous process * or similar. As such, it may need to be cancelled to avoid long * waits or even deadlocks. * * This function can be used inside a debugfs handler that may * need to be cancelled. As soon as this function is called, the * cancellation's 'cancel' callback may be called, at which point * the caller should proceed to call debugfs_leave_cancellation() * and leave the debugfs handler function as soon as possible. * Note that the 'cancel' callback is only ever called in the * context of some kind of debugfs_remove(). * * This function must be paired with debugfs_leave_cancellation(). */ void debugfs_enter_cancellation(struct file *file, struct debugfs_cancellation *cancellation) { struct debugfs_fsdata *fsd; struct dentry *dentry = F_DENTRY(file); INIT_LIST_HEAD(&cancellation->list); if (WARN_ON(!d_is_reg(dentry))) return; if (WARN_ON(!cancellation->cancel)) return; fsd = READ_ONCE(dentry->d_fsdata); if (WARN_ON(!fsd || ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT))) return; mutex_lock(&fsd->cancellations_mtx); list_add(&cancellation->list, &fsd->cancellations); mutex_unlock(&fsd->cancellations_mtx); /* if we're already removing wake it up to cancel */ if (d_unlinked(dentry)) complete(&fsd->active_users_drained); } EXPORT_SYMBOL_GPL(debugfs_enter_cancellation); /** * debugfs_leave_cancellation - leave cancellation section * @file: the file being accessed * @cancellation: the cancellation previously registered with * debugfs_enter_cancellation() * * See the documentation of debugfs_enter_cancellation(). */ void debugfs_leave_cancellation(struct file *file, struct debugfs_cancellation *cancellation) { struct debugfs_fsdata *fsd; struct dentry *dentry = F_DENTRY(file); if (WARN_ON(!d_is_reg(dentry))) return; fsd = READ_ONCE(dentry->d_fsdata); if (WARN_ON(!fsd || ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT))) return; mutex_lock(&fsd->cancellations_mtx); if (!list_empty(&cancellation->list)) list_del(&cancellation->list); mutex_unlock(&fsd->cancellations_mtx); } EXPORT_SYMBOL_GPL(debugfs_leave_cancellation); /* * Only permit access to world-readable files when the kernel is locked down. * We also need to exclude any file that has ways to write or alter it as root * can bypass the permissions check. */ static int debugfs_locked_down(struct inode *inode, struct file *filp, const struct file_operations *real_fops) { if ((inode->i_mode & 07777 & ~0444) == 0 && !(filp->f_mode & FMODE_WRITE) && !real_fops->unlocked_ioctl && !real_fops->compat_ioctl && !real_fops->mmap) return 0; if (security_locked_down(LOCKDOWN_DEBUGFS)) return -EPERM; return 0; } static int open_proxy_open(struct inode *inode, struct file *filp) { struct dentry *dentry = F_DENTRY(filp); const struct file_operations *real_fops = NULL; int r; r = debugfs_file_get(dentry); if (r) return r == -EIO ? -ENOENT : r; real_fops = debugfs_real_fops(filp); r = debugfs_locked_down(inode, filp, real_fops); if (r) goto out; if (!fops_get(real_fops)) { #ifdef CONFIG_MODULES if (real_fops->owner && real_fops->owner->state == MODULE_STATE_GOING) { r = -ENXIO; goto out; } #endif /* Huh? Module did not clean up after itself at exit? */ WARN(1, "debugfs file owner did not clean up at exit: %pd", dentry); r = -ENXIO; goto out; } replace_fops(filp, real_fops); if (real_fops->open) r = real_fops->open(inode, filp); out: debugfs_file_put(dentry); return r; } const struct file_operations debugfs_open_proxy_file_operations = { .open = open_proxy_open, }; #define PROTO(args...) args #define ARGS(args...) args #define FULL_PROXY_FUNC(name, ret_type, filp, proto, args) \ static ret_type full_proxy_ ## name(proto) \ { \ struct dentry *dentry = F_DENTRY(filp); \ const struct file_operations *real_fops; \ ret_type r; \ \ r = debugfs_file_get(dentry); \ if (unlikely(r)) \ return r; \ real_fops = debugfs_real_fops(filp); \ r = real_fops->name(args); \ debugfs_file_put(dentry); \ return r; \ } FULL_PROXY_FUNC(llseek, loff_t, filp, PROTO(struct file *filp, loff_t offset, int whence), ARGS(filp, offset, whence)); FULL_PROXY_FUNC(read, ssize_t, filp, PROTO(struct file *filp, char __user *buf, size_t size, loff_t *ppos), ARGS(filp, buf, size, ppos)); FULL_PROXY_FUNC(write, ssize_t, filp, PROTO(struct file *filp, const char __user *buf, size_t size, loff_t *ppos), ARGS(filp, buf, size, ppos)); FULL_PROXY_FUNC(unlocked_ioctl, long, filp, PROTO(struct file *filp, unsigned int cmd, unsigned long arg), ARGS(filp, cmd, arg)); static __poll_t full_proxy_poll(struct file *filp, struct poll_table_struct *wait) { struct dentry *dentry = F_DENTRY(filp); __poll_t r = 0; const struct file_operations *real_fops; if (debugfs_file_get(dentry)) return EPOLLHUP; real_fops = debugfs_real_fops(filp); r = real_fops->poll(filp, wait); debugfs_file_put(dentry); return r; } static int full_proxy_release(struct inode *inode, struct file *filp) { const struct dentry *dentry = F_DENTRY(filp); const struct file_operations *real_fops = debugfs_real_fops(filp); const struct file_operations *proxy_fops = filp->f_op; int r = 0; /* * We must not protect this against removal races here: the * original releaser should be called unconditionally in order * not to leak any resources. Releasers must not assume that * ->i_private is still being meaningful here. */ if (real_fops->release) r = real_fops->release(inode, filp); replace_fops(filp, d_inode(dentry)->i_fop); kfree(proxy_fops); fops_put(real_fops); return r; } static void __full_proxy_fops_init(struct file_operations *proxy_fops, const struct file_operations *real_fops) { proxy_fops->release = full_proxy_release; if (real_fops->llseek) proxy_fops->llseek = full_proxy_llseek; if (real_fops->read) proxy_fops->read = full_proxy_read; if (real_fops->write) proxy_fops->write = full_proxy_write; if (real_fops->poll) proxy_fops->poll = full_proxy_poll; if (real_fops->unlocked_ioctl) proxy_fops->unlocked_ioctl = full_proxy_unlocked_ioctl; } static int full_proxy_open(struct inode *inode, struct file *filp) { struct dentry *dentry = F_DENTRY(filp); const struct file_operations *real_fops = NULL; struct file_operations *proxy_fops = NULL; int r; r = debugfs_file_get(dentry); if (r) return r == -EIO ? -ENOENT : r; real_fops = debugfs_real_fops(filp); r = debugfs_locked_down(inode, filp, real_fops); if (r) goto out; if (!fops_get(real_fops)) { #ifdef CONFIG_MODULES if (real_fops->owner && real_fops->owner->state == MODULE_STATE_GOING) { r = -ENXIO; goto out; } #endif /* Huh? Module did not cleanup after itself at exit? */ WARN(1, "debugfs file owner did not clean up at exit: %pd", dentry); r = -ENXIO; goto out; } proxy_fops = kzalloc(sizeof(*proxy_fops), GFP_KERNEL); if (!proxy_fops) { r = -ENOMEM; goto free_proxy; } __full_proxy_fops_init(proxy_fops, real_fops); replace_fops(filp, proxy_fops); if (real_fops->open) { r = real_fops->open(inode, filp); if (r) { replace_fops(filp, d_inode(dentry)->i_fop); goto free_proxy; } else if (filp->f_op != proxy_fops) { /* No protection against file removal anymore. */ WARN(1, "debugfs file owner replaced proxy fops: %pd", dentry); goto free_proxy; } } goto out; free_proxy: kfree(proxy_fops); fops_put(real_fops); out: debugfs_file_put(dentry); return r; } const struct file_operations debugfs_full_proxy_file_operations = { .open = full_proxy_open, }; ssize_t debugfs_attr_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) { struct dentry *dentry = F_DENTRY(file); ssize_t ret; ret = debugfs_file_get(dentry); if (unlikely(ret)) return ret; ret = simple_attr_read(file, buf, len, ppos); debugfs_file_put(dentry); return ret; } EXPORT_SYMBOL_GPL(debugfs_attr_read); static ssize_t debugfs_attr_write_xsigned(struct file *file, const char __user *buf, size_t len, loff_t *ppos, bool is_signed) { struct dentry *dentry = F_DENTRY(file); ssize_t ret; ret = debugfs_file_get(dentry); if (unlikely(ret)) return ret; if (is_signed) ret = simple_attr_write_signed(file, buf, len, ppos); else ret = simple_attr_write(file, buf, len, ppos); debugfs_file_put(dentry); return ret; } ssize_t debugfs_attr_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos) { return debugfs_attr_write_xsigned(file, buf, len, ppos, false); } EXPORT_SYMBOL_GPL(debugfs_attr_write); ssize_t debugfs_attr_write_signed(struct file *file, const char __user *buf, size_t len, loff_t *ppos) { return debugfs_attr_write_xsigned(file, buf, len, ppos, true); } EXPORT_SYMBOL_GPL(debugfs_attr_write_signed); static struct dentry *debugfs_create_mode_unsafe(const char *name, umode_t mode, struct dentry *parent, void *value, const struct file_operations *fops, const struct file_operations *fops_ro, const struct file_operations *fops_wo) { /* if there are no write bits set, make read only */ if (!(mode & S_IWUGO)) return debugfs_create_file_unsafe(name, mode, parent, value, fops_ro); /* if there are no read bits set, make write only */ if (!(mode & S_IRUGO)) return debugfs_create_file_unsafe(name, mode, parent, value, fops_wo); return debugfs_create_file_unsafe(name, mode, parent, value, fops); } static int debugfs_u8_set(void *data, u64 val) { *(u8 *)data = val; return 0; } static int debugfs_u8_get(void *data, u64 *val) { *val = *(u8 *)data; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(fops_u8, debugfs_u8_get, debugfs_u8_set, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_u8_ro, debugfs_u8_get, NULL, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n"); /** * debugfs_create_u8 - create a debugfs file that is used to read and write an unsigned 8-bit value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. * * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. */ void debugfs_create_u8(const char *name, umode_t mode, struct dentry *parent, u8 *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u8, &fops_u8_ro, &fops_u8_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u8); static int debugfs_u16_set(void *data, u64 val) { *(u16 *)data = val; return 0; } static int debugfs_u16_get(void *data, u64 *val) { *val = *(u16 *)data; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(fops_u16, debugfs_u16_get, debugfs_u16_set, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_u16_ro, debugfs_u16_get, NULL, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n"); /** * debugfs_create_u16 - create a debugfs file that is used to read and write an unsigned 16-bit value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. * * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. */ void debugfs_create_u16(const char *name, umode_t mode, struct dentry *parent, u16 *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u16, &fops_u16_ro, &fops_u16_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u16); static int debugfs_u32_set(void *data, u64 val) { *(u32 *)data = val; return 0; } static int debugfs_u32_get(void *data, u64 *val) { *val = *(u32 *)data; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(fops_u32, debugfs_u32_get, debugfs_u32_set, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_u32_ro, debugfs_u32_get, NULL, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n"); /** * debugfs_create_u32 - create a debugfs file that is used to read and write an unsigned 32-bit value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. * * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. */ void debugfs_create_u32(const char *name, umode_t mode, struct dentry *parent, u32 *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u32, &fops_u32_ro, &fops_u32_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u32); static int debugfs_u64_set(void *data, u64 val) { *(u64 *)data = val; return 0; } static int debugfs_u64_get(void *data, u64 *val) { *val = *(u64 *)data; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(fops_u64, debugfs_u64_get, debugfs_u64_set, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_u64_ro, debugfs_u64_get, NULL, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n"); /** * debugfs_create_u64 - create a debugfs file that is used to read and write an unsigned 64-bit value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. * * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. */ void debugfs_create_u64(const char *name, umode_t mode, struct dentry *parent, u64 *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u64, &fops_u64_ro, &fops_u64_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u64); static int debugfs_ulong_set(void *data, u64 val) { *(unsigned long *)data = val; return 0; } static int debugfs_ulong_get(void *data, u64 *val) { *val = *(unsigned long *)data; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(fops_ulong, debugfs_ulong_get, debugfs_ulong_set, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_ulong_ro, debugfs_ulong_get, NULL, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_ulong_wo, NULL, debugfs_ulong_set, "%llu\n"); /** * debugfs_create_ulong - create a debugfs file that is used to read and write * an unsigned long value. * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. * * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. */ void debugfs_create_ulong(const char *name, umode_t mode, struct dentry *parent, unsigned long *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_ulong, &fops_ulong_ro, &fops_ulong_wo); } EXPORT_SYMBOL_GPL(debugfs_create_ulong); DEFINE_DEBUGFS_ATTRIBUTE(fops_x8, debugfs_u8_get, debugfs_u8_set, "0x%02llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x8_ro, debugfs_u8_get, NULL, "0x%02llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x8_wo, NULL, debugfs_u8_set, "0x%02llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x16, debugfs_u16_get, debugfs_u16_set, "0x%04llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x16_ro, debugfs_u16_get, NULL, "0x%04llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x16_wo, NULL, debugfs_u16_set, "0x%04llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, "0x%08llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x64_ro, debugfs_u64_get, NULL, "0x%016llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x64_wo, NULL, debugfs_u64_set, "0x%016llx\n"); /* * debugfs_create_x{8,16,32,64} - create a debugfs file that is used to read and write an unsigned {8,16,32,64}-bit value * * These functions are exactly the same as the above functions (but use a hex * output for the decimal challenged). For details look at the above unsigned * decimal functions. */ /** * debugfs_create_x8 - create a debugfs file that is used to read and write an unsigned 8-bit value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. */ void debugfs_create_x8(const char *name, umode_t mode, struct dentry *parent, u8 *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x8, &fops_x8_ro, &fops_x8_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x8); /** * debugfs_create_x16 - create a debugfs file that is used to read and write an unsigned 16-bit value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. */ void debugfs_create_x16(const char *name, umode_t mode, struct dentry *parent, u16 *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x16, &fops_x16_ro, &fops_x16_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x16); /** * debugfs_create_x32 - create a debugfs file that is used to read and write an unsigned 32-bit value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. */ void debugfs_create_x32(const char *name, umode_t mode, struct dentry *parent, u32 *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x32, &fops_x32_ro, &fops_x32_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x32); /** * debugfs_create_x64 - create a debugfs file that is used to read and write an unsigned 64-bit value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. */ void debugfs_create_x64(const char *name, umode_t mode, struct dentry *parent, u64 *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x64, &fops_x64_ro, &fops_x64_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x64); static int debugfs_size_t_set(void *data, u64 val) { *(size_t *)data = val; return 0; } static int debugfs_size_t_get(void *data, u64 *val) { *val = *(size_t *)data; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set, "%llu\n"); /* %llu and %zu are more or less the same */ DEFINE_DEBUGFS_ATTRIBUTE(fops_size_t_ro, debugfs_size_t_get, NULL, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_size_t_wo, NULL, debugfs_size_t_set, "%llu\n"); /** * debugfs_create_size_t - create a debugfs file that is used to read and write an size_t value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. */ void debugfs_create_size_t(const char *name, umode_t mode, struct dentry *parent, size_t *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_size_t, &fops_size_t_ro, &fops_size_t_wo); } EXPORT_SYMBOL_GPL(debugfs_create_size_t); static int debugfs_atomic_t_set(void *data, u64 val) { atomic_set((atomic_t *)data, val); return 0; } static int debugfs_atomic_t_get(void *data, u64 *val) { *val = atomic_read((atomic_t *)data); return 0; } DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t, debugfs_atomic_t_get, debugfs_atomic_t_set, "%lld\n"); DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t_ro, debugfs_atomic_t_get, NULL, "%lld\n"); DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, "%lld\n"); /** * debugfs_create_atomic_t - create a debugfs file that is used to read and * write an atomic_t value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. */ void debugfs_create_atomic_t(const char *name, umode_t mode, struct dentry *parent, atomic_t *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_atomic_t, &fops_atomic_t_ro, &fops_atomic_t_wo); } EXPORT_SYMBOL_GPL(debugfs_create_atomic_t); ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { char buf[2]; bool val; int r; struct dentry *dentry = F_DENTRY(file); r = debugfs_file_get(dentry); if (unlikely(r)) return r; val = *(bool *)file->private_data; debugfs_file_put(dentry); if (val) buf[0] = 'Y'; else buf[0] = 'N'; buf[1] = '\n'; return simple_read_from_buffer(user_buf, count, ppos, buf, 2); } EXPORT_SYMBOL_GPL(debugfs_read_file_bool); ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { bool bv; int r; bool *val = file->private_data; struct dentry *dentry = F_DENTRY(file); r = kstrtobool_from_user(user_buf, count, &bv); if (!r) { r = debugfs_file_get(dentry); if (unlikely(r)) return r; *val = bv; debugfs_file_put(dentry); } return count; } EXPORT_SYMBOL_GPL(debugfs_write_file_bool); static const struct file_operations fops_bool = { .read = debugfs_read_file_bool, .write = debugfs_write_file_bool, .open = simple_open, .llseek = default_llseek, }; static const struct file_operations fops_bool_ro = { .read = debugfs_read_file_bool, .open = simple_open, .llseek = default_llseek, }; static const struct file_operations fops_bool_wo = { .write = debugfs_write_file_bool, .open = simple_open, .llseek = default_llseek, }; /** * debugfs_create_bool - create a debugfs file that is used to read and write a boolean value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. * * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. */ void debugfs_create_bool(const char *name, umode_t mode, struct dentry *parent, bool *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_bool, &fops_bool_ro, &fops_bool_wo); } EXPORT_SYMBOL_GPL(debugfs_create_bool); ssize_t debugfs_read_file_str(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct dentry *dentry = F_DENTRY(file); char *str, *copy = NULL; int copy_len, len; ssize_t ret; ret = debugfs_file_get(dentry); if (unlikely(ret)) return ret; str = *(char **)file->private_data; len = strlen(str) + 1; copy = kmalloc(len, GFP_KERNEL); if (!copy) { debugfs_file_put(dentry); return -ENOMEM; } copy_len = strscpy(copy, str, len); debugfs_file_put(dentry); if (copy_len < 0) { kfree(copy); return copy_len; } copy[copy_len] = '\n'; ret = simple_read_from_buffer(user_buf, count, ppos, copy, len); kfree(copy); return ret; } EXPORT_SYMBOL_GPL(debugfs_create_str); static ssize_t debugfs_write_file_str(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct dentry *dentry = F_DENTRY(file); char *old, *new = NULL; int pos = *ppos; int r; r = debugfs_file_get(dentry); if (unlikely(r)) return r; old = *(char **)file->private_data; /* only allow strict concatenation */ r = -EINVAL; if (pos && pos != strlen(old)) goto error; r = -E2BIG; if (pos + count + 1 > PAGE_SIZE) goto error; r = -ENOMEM; new = kmalloc(pos + count + 1, GFP_KERNEL); if (!new) goto error; if (pos) memcpy(new, old, pos); r = -EFAULT; if (copy_from_user(new + pos, user_buf, count)) goto error; new[pos + count] = '\0'; strim(new); rcu_assign_pointer(*(char __rcu **)file->private_data, new); synchronize_rcu(); kfree(old); debugfs_file_put(dentry); return count; error: kfree(new); debugfs_file_put(dentry); return r; } static const struct file_operations fops_str = { .read = debugfs_read_file_str, .write = debugfs_write_file_str, .open = simple_open, .llseek = default_llseek, }; static const struct file_operations fops_str_ro = { .read = debugfs_read_file_str, .open = simple_open, .llseek = default_llseek, }; static const struct file_operations fops_str_wo = { .write = debugfs_write_file_str, .open = simple_open, .llseek = default_llseek, }; /** * debugfs_create_str - create a debugfs file that is used to read and write a string value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. * * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. */ void debugfs_create_str(const char *name, umode_t mode, struct dentry *parent, char **value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_str, &fops_str_ro, &fops_str_wo); } static ssize_t read_file_blob(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct debugfs_blob_wrapper *blob = file->private_data; struct dentry *dentry = F_DENTRY(file); ssize_t r; r = debugfs_file_get(dentry); if (unlikely(r)) return r; r = simple_read_from_buffer(user_buf, count, ppos, blob->data, blob->size); debugfs_file_put(dentry); return r; } static ssize_t write_file_blob(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct debugfs_blob_wrapper *blob = file->private_data; struct dentry *dentry = F_DENTRY(file); ssize_t r; r = debugfs_file_get(dentry); if (unlikely(r)) return r; r = simple_write_to_buffer(blob->data, blob->size, ppos, user_buf, count); debugfs_file_put(dentry); return r; } static const struct file_operations fops_blob = { .read = read_file_blob, .write = write_file_blob, .open = simple_open, .llseek = default_llseek, }; /** * debugfs_create_blob - create a debugfs file that is used to read and write * a binary blob * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @blob: a pointer to a struct debugfs_blob_wrapper which contains a pointer * to the blob data and the size of the data. * * This function creates a file in debugfs with the given name that exports * @blob->data as a binary blob. If the @mode variable is so set it can be * read from and written to. * * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be * returned. * * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will * be returned. */ struct dentry *debugfs_create_blob(const char *name, umode_t mode, struct dentry *parent, struct debugfs_blob_wrapper *blob) { return debugfs_create_file_unsafe(name, mode & 0644, parent, blob, &fops_blob); } EXPORT_SYMBOL_GPL(debugfs_create_blob); static size_t u32_format_array(char *buf, size_t bufsize, u32 *array, int array_size) { size_t ret = 0; while (--array_size >= 0) { size_t len; char term = array_size ? ' ' : '\n'; len = snprintf(buf, bufsize, "%u%c", *array++, term); ret += len; buf += len; bufsize -= len; } return ret; } static int u32_array_open(struct inode *inode, struct file *file) { struct debugfs_u32_array *data = inode->i_private; int size, elements = data->n_elements; char *buf; /* * Max size: * - 10 digits + ' '/'\n' = 11 bytes per number * - terminating NUL character */ size = elements*11; buf = kmalloc(size+1, GFP_KERNEL); if (!buf) return -ENOMEM; buf[size] = 0; file->private_data = buf; u32_format_array(buf, size, data->array, data->n_elements); return nonseekable_open(inode, file); } static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) { size_t size = strlen(file->private_data); return simple_read_from_buffer(buf, len, ppos, file->private_data, size); } static int u32_array_release(struct inode *inode, struct file *file) { kfree(file->private_data); return 0; } static const struct file_operations u32_array_fops = { .owner = THIS_MODULE, .open = u32_array_open, .release = u32_array_release, .read = u32_array_read, .llseek = no_llseek, }; /** * debugfs_create_u32_array - create a debugfs file that is used to read u32 * array. * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have. * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @array: wrapper struct containing data pointer and size of the array. * * This function creates a file in debugfs with the given name that exports * @array as data. If the @mode variable is so set it can be read from. * Writing is not supported. Seek within the file is also not supported. * Once array is created its size can not be changed. */ void debugfs_create_u32_array(const char *name, umode_t mode, struct dentry *parent, struct debugfs_u32_array *array) { debugfs_create_file_unsafe(name, mode, parent, array, &u32_array_fops); } EXPORT_SYMBOL_GPL(debugfs_create_u32_array); #ifdef CONFIG_HAS_IOMEM /* * The regset32 stuff is used to print 32-bit registers using the * seq_file utilities. We offer printing a register set in an already-opened * sequential file or create a debugfs file that only prints a regset32. */ /** * debugfs_print_regs32 - use seq_print to describe a set of registers * @s: the seq_file structure being used to generate output * @regs: an array if struct debugfs_reg32 structures * @nregs: the length of the above array * @base: the base address to be used in reading the registers * @prefix: a string to be prefixed to every output line * * This function outputs a text block describing the current values of * some 32-bit hardware registers. It is meant to be used within debugfs * files based on seq_file that need to show registers, intermixed with other * information. The prefix argument may be used to specify a leading string, * because some peripherals have several blocks of identical registers, * for example configuration of dma channels */ void debugfs_print_regs32(struct seq_file *s, const struct debugfs_reg32 *regs, int nregs, void __iomem *base, char *prefix) { int i; for (i = 0; i < nregs; i++, regs++) { if (prefix) seq_printf(s, "%s", prefix); seq_printf(s, "%s = 0x%08x\n", regs->name, readl(base + regs->offset)); if (seq_has_overflowed(s)) break; } } EXPORT_SYMBOL_GPL(debugfs_print_regs32); static int debugfs_regset32_show(struct seq_file *s, void *data) { struct debugfs_regset32 *regset = s->private; if (regset->dev) pm_runtime_get_sync(regset->dev); debugfs_print_regs32(s, regset->regs, regset->nregs, regset->base, ""); if (regset->dev) pm_runtime_put(regset->dev); return 0; } DEFINE_SHOW_ATTRIBUTE(debugfs_regset32); /** * debugfs_create_regset32 - create a debugfs file that returns register values * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @regset: a pointer to a struct debugfs_regset32, which contains a pointer * to an array of register definitions, the array size and the base * address where the register bank is to be found. * * This function creates a file in debugfs with the given name that reports * the names and values of a set of 32-bit registers. If the @mode variable * is so set it can be read from. Writing is not supported. */ void debugfs_create_regset32(const char *name, umode_t mode, struct dentry *parent, struct debugfs_regset32 *regset) { debugfs_create_file(name, mode, parent, regset, &debugfs_regset32_fops); } EXPORT_SYMBOL_GPL(debugfs_create_regset32); #endif /* CONFIG_HAS_IOMEM */ struct debugfs_devm_entry { int (*read)(struct seq_file *seq, void *data); struct device *dev; }; static int debugfs_devm_entry_open(struct inode *inode, struct file *f) { struct debugfs_devm_entry *entry = inode->i_private; return single_open(f, entry->read, entry->dev); } static const struct file_operations debugfs_devm_entry_ops = { .owner = THIS_MODULE, .open = debugfs_devm_entry_open, .release = single_release, .read = seq_read, .llseek = seq_lseek }; /** * debugfs_create_devm_seqfile - create a debugfs file that is bound to device. * * @dev: device related to this debugfs file. * @name: name of the debugfs file. * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @read_fn: function pointer called to print the seq_file content. */ void debugfs_create_devm_seqfile(struct device *dev, const char *name, struct dentry *parent, int (*read_fn)(struct seq_file *s, void *data)) { struct debugfs_devm_entry *entry; if (IS_ERR(parent)) return; entry = devm_kzalloc(dev, sizeof(*entry), GFP_KERNEL); if (!entry) return; entry->read = read_fn; entry->dev = dev; debugfs_create_file(name, S_IRUGO, parent, entry, &debugfs_devm_entry_ops); } EXPORT_SYMBOL_GPL(debugfs_create_devm_seqfile);
8 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 // SPDX-License-Identifier: GPL-2.0-or-later /* mpihelp-add_2.c - MPI helper functions * Copyright (C) 1994, 1996, 1997, 1998, 2001 Free Software Foundation, Inc. * * This file is part of GnuPG. * * Note: This code is heavily based on the GNU MP Library. * Actually it's the same code with only minor changes in the * way the data is stored; this is to support the abstraction * of an optional secure memory allocation which may be used * to avoid revealing of sensitive data due to paging etc. * The GNU MP Library itself is published under the LGPL; * however I decided to publish this code under the plain GPL. */ #include "mpi-internal.h" #include "longlong.h" mpi_limb_t mpihelp_sub_n(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, mpi_ptr_t s2_ptr, mpi_size_t size) { mpi_limb_t x, y, cy; mpi_size_t j; /* The loop counter and index J goes from -SIZE to -1. This way the loop becomes faster. */ j = -size; /* Offset the base pointers to compensate for the negative indices. */ s1_ptr -= j; s2_ptr -= j; res_ptr -= j; cy = 0; do { y = s2_ptr[j]; x = s1_ptr[j]; y += cy; /* add previous carry to subtrahend */ cy = y < cy; /* get out carry from that addition */ y = x - y; /* main subtract */ cy += y > x; /* get out carry from the subtract, combine */ res_ptr[j] = y; } while (++j); return cy; }
3 3 3 33 27 27 27 43 33 33 33 33 33 33 16 33 33 33 49 49 27 27 27 3 1 26 1 26 48 49 48 1 48 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 49 49 49 41 41 49 10 8 8 10 42 49 49 49 48 48 48 48 48 41 41 3 16 18 18 18 4 48 11 14 13 13 14 12 14 12 12 12 3 3 12 12 12 12 11 12 48 49 49 19 18 19 49 49 48 49 49 49 9 10 10 10 10 10 9 10 10 10 10 9 1 10 9 9 9 9 9 10 10 10 10 10 11 11 10 9 9 9 1 9 9 9 9 9 9 9 9 9 9 1 1 10 9 9 9 9 9 9 3 3 10 9 1 1 3 9 9 10 1 10 10 10 9 10 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001-2003 Intel Corp. * * This file is part of the SCTP kernel implementation * * These functions implement the sctp_outq class. The outqueue handles * bundling and queueing of outgoing SCTP chunks. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * Perry Melange <pmelange@null.cc.uic.edu> * Xingang Guo <xingang.guo@intel.com> * Hui Huang <hui.huang@nokia.com> * Sridhar Samudrala <sri@us.ibm.com> * Jon Grimm <jgrimm@us.ibm.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/list.h> /* For struct list_head */ #include <linux/socket.h> #include <linux/ip.h> #include <linux/slab.h> #include <net/sock.h> /* For skb_set_owner_w */ #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/stream_sched.h> #include <trace/events/sctp.h> /* Declare internal functions here. */ static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn); static void sctp_check_transmitted(struct sctp_outq *q, struct list_head *transmitted_queue, struct sctp_transport *transport, union sctp_addr *saddr, struct sctp_sackhdr *sack, __u32 *highest_new_tsn); static void sctp_mark_missing(struct sctp_outq *q, struct list_head *transmitted_queue, struct sctp_transport *transport, __u32 highest_new_tsn, int count_of_newacks); static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp); /* Add data to the front of the queue. */ static inline void sctp_outq_head_data(struct sctp_outq *q, struct sctp_chunk *ch) { struct sctp_stream_out_ext *oute; __u16 stream; list_add(&ch->list, &q->out_chunk_list); q->out_qlen += ch->skb->len; stream = sctp_chunk_stream_no(ch); oute = SCTP_SO(&q->asoc->stream, stream)->ext; list_add(&ch->stream_list, &oute->outq); } /* Take data from the front of the queue. */ static inline struct sctp_chunk *sctp_outq_dequeue_data(struct sctp_outq *q) { return q->sched->dequeue(q); } /* Add data chunk to the end of the queue. */ static inline void sctp_outq_tail_data(struct sctp_outq *q, struct sctp_chunk *ch) { struct sctp_stream_out_ext *oute; __u16 stream; list_add_tail(&ch->list, &q->out_chunk_list); q->out_qlen += ch->skb->len; stream = sctp_chunk_stream_no(ch); oute = SCTP_SO(&q->asoc->stream, stream)->ext; list_add_tail(&ch->stream_list, &oute->outq); } /* * SFR-CACC algorithm: * D) If count_of_newacks is greater than or equal to 2 * and t was not sent to the current primary then the * sender MUST NOT increment missing report count for t. */ static inline int sctp_cacc_skip_3_1_d(struct sctp_transport *primary, struct sctp_transport *transport, int count_of_newacks) { if (count_of_newacks >= 2 && transport != primary) return 1; return 0; } /* * SFR-CACC algorithm: * F) If count_of_newacks is less than 2, let d be the * destination to which t was sent. If cacc_saw_newack * is 0 for destination d, then the sender MUST NOT * increment missing report count for t. */ static inline int sctp_cacc_skip_3_1_f(struct sctp_transport *transport, int count_of_newacks) { if (count_of_newacks < 2 && (transport && !transport->cacc.cacc_saw_newack)) return 1; return 0; } /* * SFR-CACC algorithm: * 3.1) If CYCLING_CHANGEOVER is 0, the sender SHOULD * execute steps C, D, F. * * C has been implemented in sctp_outq_sack */ static inline int sctp_cacc_skip_3_1(struct sctp_transport *primary, struct sctp_transport *transport, int count_of_newacks) { if (!primary->cacc.cycling_changeover) { if (sctp_cacc_skip_3_1_d(primary, transport, count_of_newacks)) return 1; if (sctp_cacc_skip_3_1_f(transport, count_of_newacks)) return 1; return 0; } return 0; } /* * SFR-CACC algorithm: * 3.2) Else if CYCLING_CHANGEOVER is 1, and t is less * than next_tsn_at_change of the current primary, then * the sender MUST NOT increment missing report count * for t. */ static inline int sctp_cacc_skip_3_2(struct sctp_transport *primary, __u32 tsn) { if (primary->cacc.cycling_changeover && TSN_lt(tsn, primary->cacc.next_tsn_at_change)) return 1; return 0; } /* * SFR-CACC algorithm: * 3) If the missing report count for TSN t is to be * incremented according to [RFC2960] and * [SCTP_STEWART-2002], and CHANGEOVER_ACTIVE is set, * then the sender MUST further execute steps 3.1 and * 3.2 to determine if the missing report count for * TSN t SHOULD NOT be incremented. * * 3.3) If 3.1 and 3.2 do not dictate that the missing * report count for t should not be incremented, then * the sender SHOULD increment missing report count for * t (according to [RFC2960] and [SCTP_STEWART_2002]). */ static inline int sctp_cacc_skip(struct sctp_transport *primary, struct sctp_transport *transport, int count_of_newacks, __u32 tsn) { if (primary->cacc.changeover_active && (sctp_cacc_skip_3_1(primary, transport, count_of_newacks) || sctp_cacc_skip_3_2(primary, tsn))) return 1; return 0; } /* Initialize an existing sctp_outq. This does the boring stuff. * You still need to define handlers if you really want to DO * something with this structure... */ void sctp_outq_init(struct sctp_association *asoc, struct sctp_outq *q) { memset(q, 0, sizeof(struct sctp_outq)); q->asoc = asoc; INIT_LIST_HEAD(&q->out_chunk_list); INIT_LIST_HEAD(&q->control_chunk_list); INIT_LIST_HEAD(&q->retransmit); INIT_LIST_HEAD(&q->sacked); INIT_LIST_HEAD(&q->abandoned); sctp_sched_set_sched(asoc, sctp_sk(asoc->base.sk)->default_ss); } /* Free the outqueue structure and any related pending chunks. */ static void __sctp_outq_teardown(struct sctp_outq *q) { struct sctp_transport *transport; struct list_head *lchunk, *temp; struct sctp_chunk *chunk, *tmp; /* Throw away unacknowledged chunks. */ list_for_each_entry(transport, &q->asoc->peer.transport_addr_list, transports) { while ((lchunk = sctp_list_dequeue(&transport->transmitted)) != NULL) { chunk = list_entry(lchunk, struct sctp_chunk, transmitted_list); /* Mark as part of a failed message. */ sctp_chunk_fail(chunk, q->error); sctp_chunk_free(chunk); } } /* Throw away chunks that have been gap ACKed. */ list_for_each_safe(lchunk, temp, &q->sacked) { list_del_init(lchunk); chunk = list_entry(lchunk, struct sctp_chunk, transmitted_list); sctp_chunk_fail(chunk, q->error); sctp_chunk_free(chunk); } /* Throw away any chunks in the retransmit queue. */ list_for_each_safe(lchunk, temp, &q->retransmit) { list_del_init(lchunk); chunk = list_entry(lchunk, struct sctp_chunk, transmitted_list); sctp_chunk_fail(chunk, q->error); sctp_chunk_free(chunk); } /* Throw away any chunks that are in the abandoned queue. */ list_for_each_safe(lchunk, temp, &q->abandoned) { list_del_init(lchunk); chunk = list_entry(lchunk, struct sctp_chunk, transmitted_list); sctp_chunk_fail(chunk, q->error); sctp_chunk_free(chunk); } /* Throw away any leftover data chunks. */ while ((chunk = sctp_outq_dequeue_data(q)) != NULL) { sctp_sched_dequeue_done(q, chunk); /* Mark as send failure. */ sctp_chunk_fail(chunk, q->error); sctp_chunk_free(chunk); } /* Throw away any leftover control chunks. */ list_for_each_entry_safe(chunk, tmp, &q->control_chunk_list, list) { list_del_init(&chunk->list); sctp_chunk_free(chunk); } } void sctp_outq_teardown(struct sctp_outq *q) { __sctp_outq_teardown(q); sctp_outq_init(q->asoc, q); } /* Free the outqueue structure and any related pending chunks. */ void sctp_outq_free(struct sctp_outq *q) { /* Throw away leftover chunks. */ __sctp_outq_teardown(q); } /* Put a new chunk in an sctp_outq. */ void sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp) { struct net *net = q->asoc->base.net; pr_debug("%s: outq:%p, chunk:%p[%s]\n", __func__, q, chunk, chunk && chunk->chunk_hdr ? sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) : "illegal chunk"); /* If it is data, queue it up, otherwise, send it * immediately. */ if (sctp_chunk_is_data(chunk)) { pr_debug("%s: outqueueing: outq:%p, chunk:%p[%s])\n", __func__, q, chunk, chunk && chunk->chunk_hdr ? sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) : "illegal chunk"); sctp_outq_tail_data(q, chunk); if (chunk->asoc->peer.prsctp_capable && SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags)) chunk->asoc->sent_cnt_removable++; if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) SCTP_INC_STATS(net, SCTP_MIB_OUTUNORDERCHUNKS); else SCTP_INC_STATS(net, SCTP_MIB_OUTORDERCHUNKS); } else { list_add_tail(&chunk->list, &q->control_chunk_list); SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS); } if (!q->cork) sctp_outq_flush(q, 0, gfp); } /* Insert a chunk into the sorted list based on the TSNs. The retransmit list * and the abandoned list are in ascending order. */ static void sctp_insert_list(struct list_head *head, struct list_head *new) { struct list_head *pos; struct sctp_chunk *nchunk, *lchunk; __u32 ntsn, ltsn; int done = 0; nchunk = list_entry(new, struct sctp_chunk, transmitted_list); ntsn = ntohl(nchunk->subh.data_hdr->tsn); list_for_each(pos, head) { lchunk = list_entry(pos, struct sctp_chunk, transmitted_list); ltsn = ntohl(lchunk->subh.data_hdr->tsn); if (TSN_lt(ntsn, ltsn)) { list_add(new, pos->prev); done = 1; break; } } if (!done) list_add_tail(new, head); } static int sctp_prsctp_prune_sent(struct sctp_association *asoc, struct sctp_sndrcvinfo *sinfo, struct list_head *queue, int msg_len) { struct sctp_chunk *chk, *temp; list_for_each_entry_safe(chk, temp, queue, transmitted_list) { struct sctp_stream_out *streamout; if (!chk->msg->abandoned && (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive)) continue; chk->msg->abandoned = 1; list_del_init(&chk->transmitted_list); sctp_insert_list(&asoc->outqueue.abandoned, &chk->transmitted_list); streamout = SCTP_SO(&asoc->stream, chk->sinfo.sinfo_stream); asoc->sent_cnt_removable--; asoc->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; streamout->ext->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; if (queue != &asoc->outqueue.retransmit && !chk->tsn_gap_acked) { if (chk->transport) chk->transport->flight_size -= sctp_data_size(chk); asoc->outqueue.outstanding_bytes -= sctp_data_size(chk); } msg_len -= chk->skb->truesize + sizeof(struct sctp_chunk); if (msg_len <= 0) break; } return msg_len; } static int sctp_prsctp_prune_unsent(struct sctp_association *asoc, struct sctp_sndrcvinfo *sinfo, int msg_len) { struct sctp_outq *q = &asoc->outqueue; struct sctp_chunk *chk, *temp; struct sctp_stream_out *sout; q->sched->unsched_all(&asoc->stream); list_for_each_entry_safe(chk, temp, &q->out_chunk_list, list) { if (!chk->msg->abandoned && (!(chk->chunk_hdr->flags & SCTP_DATA_FIRST_FRAG) || !SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive)) continue; chk->msg->abandoned = 1; sctp_sched_dequeue_common(q, chk); asoc->sent_cnt_removable--; asoc->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; sout = SCTP_SO(&asoc->stream, chk->sinfo.sinfo_stream); sout->ext->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; /* clear out_curr if all frag chunks are pruned */ if (asoc->stream.out_curr == sout && list_is_last(&chk->frag_list, &chk->msg->chunks)) asoc->stream.out_curr = NULL; msg_len -= chk->skb->truesize + sizeof(struct sctp_chunk); sctp_chunk_free(chk); if (msg_len <= 0) break; } q->sched->sched_all(&asoc->stream); return msg_len; } /* Abandon the chunks according their priorities */ void sctp_prsctp_prune(struct sctp_association *asoc, struct sctp_sndrcvinfo *sinfo, int msg_len) { struct sctp_transport *transport; if (!asoc->peer.prsctp_capable || !asoc->sent_cnt_removable) return; msg_len = sctp_prsctp_prune_sent(asoc, sinfo, &asoc->outqueue.retransmit, msg_len); if (msg_len <= 0) return; list_for_each_entry(transport, &asoc->peer.transport_addr_list, transports) { msg_len = sctp_prsctp_prune_sent(asoc, sinfo, &transport->transmitted, msg_len); if (msg_len <= 0) return; } sctp_prsctp_prune_unsent(asoc, sinfo, msg_len); } /* Mark all the eligible packets on a transport for retransmission. */ void sctp_retransmit_mark(struct sctp_outq *q, struct sctp_transport *transport, __u8 reason) { struct list_head *lchunk, *ltemp; struct sctp_chunk *chunk; /* Walk through the specified transmitted queue. */ list_for_each_safe(lchunk, ltemp, &transport->transmitted) { chunk = list_entry(lchunk, struct sctp_chunk, transmitted_list); /* If the chunk is abandoned, move it to abandoned list. */ if (sctp_chunk_abandoned(chunk)) { list_del_init(lchunk); sctp_insert_list(&q->abandoned, lchunk); /* If this chunk has not been previousely acked, * stop considering it 'outstanding'. Our peer * will most likely never see it since it will * not be retransmitted */ if (!chunk->tsn_gap_acked) { if (chunk->transport) chunk->transport->flight_size -= sctp_data_size(chunk); q->outstanding_bytes -= sctp_data_size(chunk); q->asoc->peer.rwnd += sctp_data_size(chunk); } continue; } /* If we are doing retransmission due to a timeout or pmtu * discovery, only the chunks that are not yet acked should * be added to the retransmit queue. */ if ((reason == SCTP_RTXR_FAST_RTX && (chunk->fast_retransmit == SCTP_NEED_FRTX)) || (reason != SCTP_RTXR_FAST_RTX && !chunk->tsn_gap_acked)) { /* RFC 2960 6.2.1 Processing a Received SACK * * C) Any time a DATA chunk is marked for * retransmission (via either T3-rtx timer expiration * (Section 6.3.3) or via fast retransmit * (Section 7.2.4)), add the data size of those * chunks to the rwnd. */ q->asoc->peer.rwnd += sctp_data_size(chunk); q->outstanding_bytes -= sctp_data_size(chunk); if (chunk->transport) transport->flight_size -= sctp_data_size(chunk); /* sctpimpguide-05 Section 2.8.2 * M5) If a T3-rtx timer expires, the * 'TSN.Missing.Report' of all affected TSNs is set * to 0. */ chunk->tsn_missing_report = 0; /* If a chunk that is being used for RTT measurement * has to be retransmitted, we cannot use this chunk * anymore for RTT measurements. Reset rto_pending so * that a new RTT measurement is started when a new * data chunk is sent. */ if (chunk->rtt_in_progress) { chunk->rtt_in_progress = 0; transport->rto_pending = 0; } /* Move the chunk to the retransmit queue. The chunks * on the retransmit queue are always kept in order. */ list_del_init(lchunk); sctp_insert_list(&q->retransmit, lchunk); } } pr_debug("%s: transport:%p, reason:%d, cwnd:%d, ssthresh:%d, " "flight_size:%d, pba:%d\n", __func__, transport, reason, transport->cwnd, transport->ssthresh, transport->flight_size, transport->partial_bytes_acked); } /* Mark all the eligible packets on a transport for retransmission and force * one packet out. */ void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport, enum sctp_retransmit_reason reason) { struct net *net = q->asoc->base.net; switch (reason) { case SCTP_RTXR_T3_RTX: SCTP_INC_STATS(net, SCTP_MIB_T3_RETRANSMITS); sctp_transport_lower_cwnd(transport, SCTP_LOWER_CWND_T3_RTX); /* Update the retran path if the T3-rtx timer has expired for * the current retran path. */ if (transport == transport->asoc->peer.retran_path) sctp_assoc_update_retran_path(transport->asoc); transport->asoc->rtx_data_chunks += transport->asoc->unack_data; if (transport->pl.state == SCTP_PL_COMPLETE && transport->asoc->unack_data) sctp_transport_reset_probe_timer(transport); break; case SCTP_RTXR_FAST_RTX: SCTP_INC_STATS(net, SCTP_MIB_FAST_RETRANSMITS); sctp_transport_lower_cwnd(transport, SCTP_LOWER_CWND_FAST_RTX); q->fast_rtx = 1; break; case SCTP_RTXR_PMTUD: SCTP_INC_STATS(net, SCTP_MIB_PMTUD_RETRANSMITS); break; case SCTP_RTXR_T1_RTX: SCTP_INC_STATS(net, SCTP_MIB_T1_RETRANSMITS); transport->asoc->init_retries++; break; default: BUG(); } sctp_retransmit_mark(q, transport, reason); /* PR-SCTP A5) Any time the T3-rtx timer expires, on any destination, * the sender SHOULD try to advance the "Advanced.Peer.Ack.Point" by * following the procedures outlined in C1 - C5. */ if (reason == SCTP_RTXR_T3_RTX) q->asoc->stream.si->generate_ftsn(q, q->asoc->ctsn_ack_point); /* Flush the queues only on timeout, since fast_rtx is only * triggered during sack processing and the queue * will be flushed at the end. */ if (reason != SCTP_RTXR_FAST_RTX) sctp_outq_flush(q, /* rtx_timeout */ 1, GFP_ATOMIC); } /* * Transmit DATA chunks on the retransmit queue. Upon return from * __sctp_outq_flush_rtx() the packet 'pkt' may contain chunks which * need to be transmitted by the caller. * We assume that pkt->transport has already been set. * * The return value is a normal kernel error return value. */ static int __sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt, int rtx_timeout, int *start_timer, gfp_t gfp) { struct sctp_transport *transport = pkt->transport; struct sctp_chunk *chunk, *chunk1; struct list_head *lqueue; enum sctp_xmit status; int error = 0; int timer = 0; int done = 0; int fast_rtx; lqueue = &q->retransmit; fast_rtx = q->fast_rtx; /* This loop handles time-out retransmissions, fast retransmissions, * and retransmissions due to opening of whindow. * * RFC 2960 6.3.3 Handle T3-rtx Expiration * * E3) Determine how many of the earliest (i.e., lowest TSN) * outstanding DATA chunks for the address for which the * T3-rtx has expired will fit into a single packet, subject * to the MTU constraint for the path corresponding to the * destination transport address to which the retransmission * is being sent (this may be different from the address for * which the timer expires [see Section 6.4]). Call this value * K. Bundle and retransmit those K DATA chunks in a single * packet to the destination endpoint. * * [Just to be painfully clear, if we are retransmitting * because a timeout just happened, we should send only ONE * packet of retransmitted data.] * * For fast retransmissions we also send only ONE packet. However, * if we are just flushing the queue due to open window, we'll * try to send as much as possible. */ list_for_each_entry_safe(chunk, chunk1, lqueue, transmitted_list) { /* If the chunk is abandoned, move it to abandoned list. */ if (sctp_chunk_abandoned(chunk)) { list_del_init(&chunk->transmitted_list); sctp_insert_list(&q->abandoned, &chunk->transmitted_list); continue; } /* Make sure that Gap Acked TSNs are not retransmitted. A * simple approach is just to move such TSNs out of the * way and into a 'transmitted' queue and skip to the * next chunk. */ if (chunk->tsn_gap_acked) { list_move_tail(&chunk->transmitted_list, &transport->transmitted); continue; } /* If we are doing fast retransmit, ignore non-fast_rtransmit * chunks */ if (fast_rtx && !chunk->fast_retransmit) continue; redo: /* Attempt to append this chunk to the packet. */ status = sctp_packet_append_chunk(pkt, chunk); switch (status) { case SCTP_XMIT_PMTU_FULL: if (!pkt->has_data && !pkt->has_cookie_echo) { /* If this packet did not contain DATA then * retransmission did not happen, so do it * again. We'll ignore the error here since * control chunks are already freed so there * is nothing we can do. */ sctp_packet_transmit(pkt, gfp); goto redo; } /* Send this packet. */ error = sctp_packet_transmit(pkt, gfp); /* If we are retransmitting, we should only * send a single packet. * Otherwise, try appending this chunk again. */ if (rtx_timeout || fast_rtx) done = 1; else goto redo; /* Bundle next chunk in the next round. */ break; case SCTP_XMIT_RWND_FULL: /* Send this packet. */ error = sctp_packet_transmit(pkt, gfp); /* Stop sending DATA as there is no more room * at the receiver. */ done = 1; break; case SCTP_XMIT_DELAY: /* Send this packet. */ error = sctp_packet_transmit(pkt, gfp); /* Stop sending DATA because of nagle delay. */ done = 1; break; default: /* The append was successful, so add this chunk to * the transmitted list. */ list_move_tail(&chunk->transmitted_list, &transport->transmitted); /* Mark the chunk as ineligible for fast retransmit * after it is retransmitted. */ if (chunk->fast_retransmit == SCTP_NEED_FRTX) chunk->fast_retransmit = SCTP_DONT_FRTX; q->asoc->stats.rtxchunks++; break; } /* Set the timer if there were no errors */ if (!error && !timer) timer = 1; if (done) break; } /* If we are here due to a retransmit timeout or a fast * retransmit and if there are any chunks left in the retransmit * queue that could not fit in the PMTU sized packet, they need * to be marked as ineligible for a subsequent fast retransmit. */ if (rtx_timeout || fast_rtx) { list_for_each_entry(chunk1, lqueue, transmitted_list) { if (chunk1->fast_retransmit == SCTP_NEED_FRTX) chunk1->fast_retransmit = SCTP_DONT_FRTX; } } *start_timer = timer; /* Clear fast retransmit hint */ if (fast_rtx) q->fast_rtx = 0; return error; } /* Cork the outqueue so queued chunks are really queued. */ void sctp_outq_uncork(struct sctp_outq *q, gfp_t gfp) { if (q->cork) q->cork = 0; sctp_outq_flush(q, 0, gfp); } static int sctp_packet_singleton(struct sctp_transport *transport, struct sctp_chunk *chunk, gfp_t gfp) { const struct sctp_association *asoc = transport->asoc; const __u16 sport = asoc->base.bind_addr.port; const __u16 dport = asoc->peer.port; const __u32 vtag = asoc->peer.i.init_tag; struct sctp_packet singleton; sctp_packet_init(&singleton, transport, sport, dport); sctp_packet_config(&singleton, vtag, 0); if (sctp_packet_append_chunk(&singleton, chunk) != SCTP_XMIT_OK) { list_del_init(&chunk->list); sctp_chunk_free(chunk); return -ENOMEM; } return sctp_packet_transmit(&singleton, gfp); } /* Struct to hold the context during sctp outq flush */ struct sctp_flush_ctx { struct sctp_outq *q; /* Current transport being used. It's NOT the same as curr active one */ struct sctp_transport *transport; /* These transports have chunks to send. */ struct list_head transport_list; struct sctp_association *asoc; /* Packet on the current transport above */ struct sctp_packet *packet; gfp_t gfp; }; /* transport: current transport */ static void sctp_outq_select_transport(struct sctp_flush_ctx *ctx, struct sctp_chunk *chunk) { struct sctp_transport *new_transport = chunk->transport; if (!new_transport) { if (!sctp_chunk_is_data(chunk)) { /* If we have a prior transport pointer, see if * the destination address of the chunk * matches the destination address of the * current transport. If not a match, then * try to look up the transport with a given * destination address. We do this because * after processing ASCONFs, we may have new * transports created. */ if (ctx->transport && sctp_cmp_addr_exact(&chunk->dest, &ctx->transport->ipaddr)) new_transport = ctx->transport; else new_transport = sctp_assoc_lookup_paddr(ctx->asoc, &chunk->dest); } /* if we still don't have a new transport, then * use the current active path. */ if (!new_transport) new_transport = ctx->asoc->peer.active_path; } else { __u8 type; switch (new_transport->state) { case SCTP_INACTIVE: case SCTP_UNCONFIRMED: case SCTP_PF: /* If the chunk is Heartbeat or Heartbeat Ack, * send it to chunk->transport, even if it's * inactive. * * 3.3.6 Heartbeat Acknowledgement: * ... * A HEARTBEAT ACK is always sent to the source IP * address of the IP datagram containing the * HEARTBEAT chunk to which this ack is responding. * ... * * ASCONF_ACKs also must be sent to the source. */ type = chunk->chunk_hdr->type; if (type != SCTP_CID_HEARTBEAT && type != SCTP_CID_HEARTBEAT_ACK && type != SCTP_CID_ASCONF_ACK) new_transport = ctx->asoc->peer.active_path; break; default: break; } } /* Are we switching transports? Take care of transport locks. */ if (new_transport != ctx->transport) { ctx->transport = new_transport; ctx->packet = &ctx->transport->packet; if (list_empty(&ctx->transport->send_ready)) list_add_tail(&ctx->transport->send_ready, &ctx->transport_list); sctp_packet_config(ctx->packet, ctx->asoc->peer.i.init_tag, ctx->asoc->peer.ecn_capable); /* We've switched transports, so apply the * Burst limit to the new transport. */ sctp_transport_burst_limited(ctx->transport); } } static void sctp_outq_flush_ctrl(struct sctp_flush_ctx *ctx) { struct sctp_chunk *chunk, *tmp; enum sctp_xmit status; int one_packet, error; list_for_each_entry_safe(chunk, tmp, &ctx->q->control_chunk_list, list) { one_packet = 0; /* RFC 5061, 5.3 * F1) This means that until such time as the ASCONF * containing the add is acknowledged, the sender MUST * NOT use the new IP address as a source for ANY SCTP * packet except on carrying an ASCONF Chunk. */ if (ctx->asoc->src_out_of_asoc_ok && chunk->chunk_hdr->type != SCTP_CID_ASCONF) continue; list_del_init(&chunk->list); /* Pick the right transport to use. Should always be true for * the first chunk as we don't have a transport by then. */ sctp_outq_select_transport(ctx, chunk); switch (chunk->chunk_hdr->type) { /* 6.10 Bundling * ... * An endpoint MUST NOT bundle INIT, INIT ACK or SHUTDOWN * COMPLETE with any other chunks. [Send them immediately.] */ case SCTP_CID_INIT: case SCTP_CID_INIT_ACK: case SCTP_CID_SHUTDOWN_COMPLETE: error = sctp_packet_singleton(ctx->transport, chunk, ctx->gfp); if (error < 0) { ctx->asoc->base.sk->sk_err = -error; return; } ctx->asoc->stats.octrlchunks++; break; case SCTP_CID_ABORT: if (sctp_test_T_bit(chunk)) ctx->packet->vtag = ctx->asoc->c.my_vtag; fallthrough; /* The following chunks are "response" chunks, i.e. * they are generated in response to something we * received. If we are sending these, then we can * send only 1 packet containing these chunks. */ case SCTP_CID_HEARTBEAT_ACK: case SCTP_CID_SHUTDOWN_ACK: case SCTP_CID_COOKIE_ACK: case SCTP_CID_COOKIE_ECHO: case SCTP_CID_ERROR: case SCTP_CID_ECN_CWR: case SCTP_CID_ASCONF_ACK: one_packet = 1; fallthrough; case SCTP_CID_HEARTBEAT: if (chunk->pmtu_probe) { error = sctp_packet_singleton(ctx->transport, chunk, ctx->gfp); if (!error) ctx->asoc->stats.octrlchunks++; break; } fallthrough; case SCTP_CID_SACK: case SCTP_CID_SHUTDOWN: case SCTP_CID_ECN_ECNE: case SCTP_CID_ASCONF: case SCTP_CID_FWD_TSN: case SCTP_CID_I_FWD_TSN: case SCTP_CID_RECONF: status = sctp_packet_transmit_chunk(ctx->packet, chunk, one_packet, ctx->gfp); if (status != SCTP_XMIT_OK) { /* put the chunk back */ list_add(&chunk->list, &ctx->q->control_chunk_list); break; } ctx->asoc->stats.octrlchunks++; /* PR-SCTP C5) If a FORWARD TSN is sent, the * sender MUST assure that at least one T3-rtx * timer is running. */ if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN || chunk->chunk_hdr->type == SCTP_CID_I_FWD_TSN) { sctp_transport_reset_t3_rtx(ctx->transport); ctx->transport->last_time_sent = jiffies; } if (chunk == ctx->asoc->strreset_chunk) sctp_transport_reset_reconf_timer(ctx->transport); break; default: /* We built a chunk with an illegal type! */ BUG(); } } } /* Returns false if new data shouldn't be sent */ static bool sctp_outq_flush_rtx(struct sctp_flush_ctx *ctx, int rtx_timeout) { int error, start_timer = 0; if (ctx->asoc->peer.retran_path->state == SCTP_UNCONFIRMED) return false; if (ctx->transport != ctx->asoc->peer.retran_path) { /* Switch transports & prepare the packet. */ ctx->transport = ctx->asoc->peer.retran_path; ctx->packet = &ctx->transport->packet; if (list_empty(&ctx->transport->send_ready)) list_add_tail(&ctx->transport->send_ready, &ctx->transport_list); sctp_packet_config(ctx->packet, ctx->asoc->peer.i.init_tag, ctx->asoc->peer.ecn_capable); } error = __sctp_outq_flush_rtx(ctx->q, ctx->packet, rtx_timeout, &start_timer, ctx->gfp); if (error < 0) ctx->asoc->base.sk->sk_err = -error; if (start_timer) { sctp_transport_reset_t3_rtx(ctx->transport); ctx->transport->last_time_sent = jiffies; } /* This can happen on COOKIE-ECHO resend. Only * one chunk can get bundled with a COOKIE-ECHO. */ if (ctx->packet->has_cookie_echo) return false; /* Don't send new data if there is still data * waiting to retransmit. */ if (!list_empty(&ctx->q->retransmit)) return false; return true; } static void sctp_outq_flush_data(struct sctp_flush_ctx *ctx, int rtx_timeout) { struct sctp_chunk *chunk; enum sctp_xmit status; /* Is it OK to send data chunks? */ switch (ctx->asoc->state) { case SCTP_STATE_COOKIE_ECHOED: /* Only allow bundling when this packet has a COOKIE-ECHO * chunk. */ if (!ctx->packet || !ctx->packet->has_cookie_echo) return; fallthrough; case SCTP_STATE_ESTABLISHED: case SCTP_STATE_SHUTDOWN_PENDING: case SCTP_STATE_SHUTDOWN_RECEIVED: break; default: /* Do nothing. */ return; } /* RFC 2960 6.1 Transmission of DATA Chunks * * C) When the time comes for the sender to transmit, * before sending new DATA chunks, the sender MUST * first transmit any outstanding DATA chunks which * are marked for retransmission (limited by the * current cwnd). */ if (!list_empty(&ctx->q->retransmit) && !sctp_outq_flush_rtx(ctx, rtx_timeout)) return; /* Apply Max.Burst limitation to the current transport in * case it will be used for new data. We are going to * rest it before we return, but we want to apply the limit * to the currently queued data. */ if (ctx->transport) sctp_transport_burst_limited(ctx->transport); /* Finally, transmit new packets. */ while ((chunk = sctp_outq_dequeue_data(ctx->q)) != NULL) { __u32 sid = ntohs(chunk->subh.data_hdr->stream); __u8 stream_state = SCTP_SO(&ctx->asoc->stream, sid)->state; /* Has this chunk expired? */ if (sctp_chunk_abandoned(chunk)) { sctp_sched_dequeue_done(ctx->q, chunk); sctp_chunk_fail(chunk, 0); sctp_chunk_free(chunk); continue; } if (stream_state == SCTP_STREAM_CLOSED) { sctp_outq_head_data(ctx->q, chunk); break; } sctp_outq_select_transport(ctx, chunk); pr_debug("%s: outq:%p, chunk:%p[%s], tx-tsn:0x%x skb->head:%p skb->users:%d\n", __func__, ctx->q, chunk, chunk && chunk->chunk_hdr ? sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) : "illegal chunk", ntohl(chunk->subh.data_hdr->tsn), chunk->skb ? chunk->skb->head : NULL, chunk->skb ? refcount_read(&chunk->skb->users) : -1); /* Add the chunk to the packet. */ status = sctp_packet_transmit_chunk(ctx->packet, chunk, 0, ctx->gfp); if (status != SCTP_XMIT_OK) { /* We could not append this chunk, so put * the chunk back on the output queue. */ pr_debug("%s: could not transmit tsn:0x%x, status:%d\n", __func__, ntohl(chunk->subh.data_hdr->tsn), status); sctp_outq_head_data(ctx->q, chunk); break; } /* The sender is in the SHUTDOWN-PENDING state, * The sender MAY set the I-bit in the DATA * chunk header. */ if (ctx->asoc->state == SCTP_STATE_SHUTDOWN_PENDING) chunk->chunk_hdr->flags |= SCTP_DATA_SACK_IMM; if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) ctx->asoc->stats.ouodchunks++; else ctx->asoc->stats.oodchunks++; /* Only now it's safe to consider this * chunk as sent, sched-wise. */ sctp_sched_dequeue_done(ctx->q, chunk); list_add_tail(&chunk->transmitted_list, &ctx->transport->transmitted); sctp_transport_reset_t3_rtx(ctx->transport); ctx->transport->last_time_sent = jiffies; /* Only let one DATA chunk get bundled with a * COOKIE-ECHO chunk. */ if (ctx->packet->has_cookie_echo) break; } } static void sctp_outq_flush_transports(struct sctp_flush_ctx *ctx) { struct sock *sk = ctx->asoc->base.sk; struct list_head *ltransport; struct sctp_packet *packet; struct sctp_transport *t; int error = 0; while ((ltransport = sctp_list_dequeue(&ctx->transport_list)) != NULL) { t = list_entry(ltransport, struct sctp_transport, send_ready); packet = &t->packet; if (!sctp_packet_empty(packet)) { rcu_read_lock(); if (t->dst && __sk_dst_get(sk) != t->dst) { dst_hold(t->dst); sk_setup_caps(sk, t->dst); } rcu_read_unlock(); error = sctp_packet_transmit(packet, ctx->gfp); if (error < 0) ctx->q->asoc->base.sk->sk_err = -error; } /* Clear the burst limited state, if any */ sctp_transport_burst_reset(t); } } /* Try to flush an outqueue. * * Description: Send everything in q which we legally can, subject to * congestion limitations. * * Note: This function can be called from multiple contexts so appropriate * locking concerns must be made. Today we use the sock lock to protect * this function. */ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) { struct sctp_flush_ctx ctx = { .q = q, .transport = NULL, .transport_list = LIST_HEAD_INIT(ctx.transport_list), .asoc = q->asoc, .packet = NULL, .gfp = gfp, }; /* 6.10 Bundling * ... * When bundling control chunks with DATA chunks, an * endpoint MUST place control chunks first in the outbound * SCTP packet. The transmitter MUST transmit DATA chunks * within a SCTP packet in increasing order of TSN. * ... */ sctp_outq_flush_ctrl(&ctx); if (q->asoc->src_out_of_asoc_ok) goto sctp_flush_out; sctp_outq_flush_data(&ctx, rtx_timeout); sctp_flush_out: sctp_outq_flush_transports(&ctx); } /* Update unack_data based on the incoming SACK chunk */ static void sctp_sack_update_unack_data(struct sctp_association *assoc, struct sctp_sackhdr *sack) { union sctp_sack_variable *frags; __u16 unack_data; int i; unack_data = assoc->next_tsn - assoc->ctsn_ack_point - 1; frags = (union sctp_sack_variable *)(sack + 1); for (i = 0; i < ntohs(sack->num_gap_ack_blocks); i++) { unack_data -= ((ntohs(frags[i].gab.end) - ntohs(frags[i].gab.start) + 1)); } assoc->unack_data = unack_data; } /* This is where we REALLY process a SACK. * * Process the SACK against the outqueue. Mostly, this just frees * things off the transmitted queue. */ int sctp_outq_sack(struct sctp_outq *q, struct sctp_chunk *chunk) { struct sctp_association *asoc = q->asoc; struct sctp_sackhdr *sack = chunk->subh.sack_hdr; struct sctp_transport *transport; struct sctp_chunk *tchunk = NULL; struct list_head *lchunk, *transport_list, *temp; __u32 sack_ctsn, ctsn, tsn; __u32 highest_tsn, highest_new_tsn; __u32 sack_a_rwnd; unsigned int outstanding; struct sctp_transport *primary = asoc->peer.primary_path; int count_of_newacks = 0; int gap_ack_blocks; u8 accum_moved = 0; /* Grab the association's destination address list. */ transport_list = &asoc->peer.transport_addr_list; /* SCTP path tracepoint for congestion control debugging. */ if (trace_sctp_probe_path_enabled()) { list_for_each_entry(transport, transport_list, transports) trace_sctp_probe_path(transport, asoc); } sack_ctsn = ntohl(sack->cum_tsn_ack); gap_ack_blocks = ntohs(sack->num_gap_ack_blocks); asoc->stats.gapcnt += gap_ack_blocks; /* * SFR-CACC algorithm: * On receipt of a SACK the sender SHOULD execute the * following statements. * * 1) If the cumulative ack in the SACK passes next tsn_at_change * on the current primary, the CHANGEOVER_ACTIVE flag SHOULD be * cleared. The CYCLING_CHANGEOVER flag SHOULD also be cleared for * all destinations. * 2) If the SACK contains gap acks and the flag CHANGEOVER_ACTIVE * is set the receiver of the SACK MUST take the following actions: * * A) Initialize the cacc_saw_newack to 0 for all destination * addresses. * * Only bother if changeover_active is set. Otherwise, this is * totally suboptimal to do on every SACK. */ if (primary->cacc.changeover_active) { u8 clear_cycling = 0; if (TSN_lte(primary->cacc.next_tsn_at_change, sack_ctsn)) { primary->cacc.changeover_active = 0; clear_cycling = 1; } if (clear_cycling || gap_ack_blocks) { list_for_each_entry(transport, transport_list, transports) { if (clear_cycling) transport->cacc.cycling_changeover = 0; if (gap_ack_blocks) transport->cacc.cacc_saw_newack = 0; } } } /* Get the highest TSN in the sack. */ highest_tsn = sack_ctsn; if (gap_ack_blocks) { union sctp_sack_variable *frags = (union sctp_sack_variable *)(sack + 1); highest_tsn += ntohs(frags[gap_ack_blocks - 1].gab.end); } if (TSN_lt(asoc->highest_sacked, highest_tsn)) asoc->highest_sacked = highest_tsn; highest_new_tsn = sack_ctsn; /* Run through the retransmit queue. Credit bytes received * and free those chunks that we can. */ sctp_check_transmitted(q, &q->retransmit, NULL, NULL, sack, &highest_new_tsn); /* Run through the transmitted queue. * Credit bytes received and free those chunks which we can. * * This is a MASSIVE candidate for optimization. */ list_for_each_entry(transport, transport_list, transports) { sctp_check_transmitted(q, &transport->transmitted, transport, &chunk->source, sack, &highest_new_tsn); /* * SFR-CACC algorithm: * C) Let count_of_newacks be the number of * destinations for which cacc_saw_newack is set. */ if (transport->cacc.cacc_saw_newack) count_of_newacks++; } /* Move the Cumulative TSN Ack Point if appropriate. */ if (TSN_lt(asoc->ctsn_ack_point, sack_ctsn)) { asoc->ctsn_ack_point = sack_ctsn; accum_moved = 1; } if (gap_ack_blocks) { if (asoc->fast_recovery && accum_moved) highest_new_tsn = highest_tsn; list_for_each_entry(transport, transport_list, transports) sctp_mark_missing(q, &transport->transmitted, transport, highest_new_tsn, count_of_newacks); } /* Update unack_data field in the assoc. */ sctp_sack_update_unack_data(asoc, sack); ctsn = asoc->ctsn_ack_point; /* Throw away stuff rotting on the sack queue. */ list_for_each_safe(lchunk, temp, &q->sacked) { tchunk = list_entry(lchunk, struct sctp_chunk, transmitted_list); tsn = ntohl(tchunk->subh.data_hdr->tsn); if (TSN_lte(tsn, ctsn)) { list_del_init(&tchunk->transmitted_list); if (asoc->peer.prsctp_capable && SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags)) asoc->sent_cnt_removable--; sctp_chunk_free(tchunk); } } /* ii) Set rwnd equal to the newly received a_rwnd minus the * number of bytes still outstanding after processing the * Cumulative TSN Ack and the Gap Ack Blocks. */ sack_a_rwnd = ntohl(sack->a_rwnd); asoc->peer.zero_window_announced = !sack_a_rwnd; outstanding = q->outstanding_bytes; if (outstanding < sack_a_rwnd) sack_a_rwnd -= outstanding; else sack_a_rwnd = 0; asoc->peer.rwnd = sack_a_rwnd; asoc->stream.si->generate_ftsn(q, sack_ctsn); pr_debug("%s: sack cumulative tsn ack:0x%x\n", __func__, sack_ctsn); pr_debug("%s: cumulative tsn ack of assoc:%p is 0x%x, " "advertised peer ack point:0x%x\n", __func__, asoc, ctsn, asoc->adv_peer_ack_point); return sctp_outq_is_empty(q); } /* Is the outqueue empty? * The queue is empty when we have not pending data, no in-flight data * and nothing pending retransmissions. */ int sctp_outq_is_empty(const struct sctp_outq *q) { return q->out_qlen == 0 && q->outstanding_bytes == 0 && list_empty(&q->retransmit); } /******************************************************************** * 2nd Level Abstractions ********************************************************************/ /* Go through a transport's transmitted list or the association's retransmit * list and move chunks that are acked by the Cumulative TSN Ack to q->sacked. * The retransmit list will not have an associated transport. * * I added coherent debug information output. --xguo * * Instead of printing 'sacked' or 'kept' for each TSN on the * transmitted_queue, we print a range: SACKED: TSN1-TSN2, TSN3, TSN4-TSN5. * KEPT TSN6-TSN7, etc. */ static void sctp_check_transmitted(struct sctp_outq *q, struct list_head *transmitted_queue, struct sctp_transport *transport, union sctp_addr *saddr, struct sctp_sackhdr *sack, __u32 *highest_new_tsn_in_sack) { struct list_head *lchunk; struct sctp_chunk *tchunk; struct list_head tlist; __u32 tsn; __u32 sack_ctsn; __u32 rtt; __u8 restart_timer = 0; int bytes_acked = 0; int migrate_bytes = 0; bool forward_progress = false; sack_ctsn = ntohl(sack->cum_tsn_ack); INIT_LIST_HEAD(&tlist); /* The while loop will skip empty transmitted queues. */ while (NULL != (lchunk = sctp_list_dequeue(transmitted_queue))) { tchunk = list_entry(lchunk, struct sctp_chunk, transmitted_list); if (sctp_chunk_abandoned(tchunk)) { /* Move the chunk to abandoned list. */ sctp_insert_list(&q->abandoned, lchunk); /* If this chunk has not been acked, stop * considering it as 'outstanding'. */ if (transmitted_queue != &q->retransmit && !tchunk->tsn_gap_acked) { if (tchunk->transport) tchunk->transport->flight_size -= sctp_data_size(tchunk); q->outstanding_bytes -= sctp_data_size(tchunk); } continue; } tsn = ntohl(tchunk->subh.data_hdr->tsn); if (sctp_acked(sack, tsn)) { /* If this queue is the retransmit queue, the * retransmit timer has already reclaimed * the outstanding bytes for this chunk, so only * count bytes associated with a transport. */ if (transport && !tchunk->tsn_gap_acked) { /* If this chunk is being used for RTT * measurement, calculate the RTT and update * the RTO using this value. * * 6.3.1 C5) Karn's algorithm: RTT measurements * MUST NOT be made using packets that were * retransmitted (and thus for which it is * ambiguous whether the reply was for the * first instance of the packet or a later * instance). */ if (!sctp_chunk_retransmitted(tchunk) && tchunk->rtt_in_progress) { tchunk->rtt_in_progress = 0; rtt = jiffies - tchunk->sent_at; sctp_transport_update_rto(transport, rtt); } if (TSN_lte(tsn, sack_ctsn)) { /* * SFR-CACC algorithm: * 2) If the SACK contains gap acks * and the flag CHANGEOVER_ACTIVE is * set the receiver of the SACK MUST * take the following action: * * B) For each TSN t being acked that * has not been acked in any SACK so * far, set cacc_saw_newack to 1 for * the destination that the TSN was * sent to. */ if (sack->num_gap_ack_blocks && q->asoc->peer.primary_path->cacc. changeover_active) transport->cacc.cacc_saw_newack = 1; } } /* If the chunk hasn't been marked as ACKED, * mark it and account bytes_acked if the * chunk had a valid transport (it will not * have a transport if ASCONF had deleted it * while DATA was outstanding). */ if (!tchunk->tsn_gap_acked) { tchunk->tsn_gap_acked = 1; if (TSN_lt(*highest_new_tsn_in_sack, tsn)) *highest_new_tsn_in_sack = tsn; bytes_acked += sctp_data_size(tchunk); if (!tchunk->transport) migrate_bytes += sctp_data_size(tchunk); forward_progress = true; } if (TSN_lte(tsn, sack_ctsn)) { /* RFC 2960 6.3.2 Retransmission Timer Rules * * R3) Whenever a SACK is received * that acknowledges the DATA chunk * with the earliest outstanding TSN * for that address, restart T3-rtx * timer for that address with its * current RTO. */ restart_timer = 1; forward_progress = true; list_add_tail(&tchunk->transmitted_list, &q->sacked); } else { /* RFC2960 7.2.4, sctpimpguide-05 2.8.2 * M2) Each time a SACK arrives reporting * 'Stray DATA chunk(s)' record the highest TSN * reported as newly acknowledged, call this * value 'HighestTSNinSack'. A newly * acknowledged DATA chunk is one not * previously acknowledged in a SACK. * * When the SCTP sender of data receives a SACK * chunk that acknowledges, for the first time, * the receipt of a DATA chunk, all the still * unacknowledged DATA chunks whose TSN is * older than that newly acknowledged DATA * chunk, are qualified as 'Stray DATA chunks'. */ list_add_tail(lchunk, &tlist); } } else { if (tchunk->tsn_gap_acked) { pr_debug("%s: receiver reneged on data TSN:0x%x\n", __func__, tsn); tchunk->tsn_gap_acked = 0; if (tchunk->transport) bytes_acked -= sctp_data_size(tchunk); /* RFC 2960 6.3.2 Retransmission Timer Rules * * R4) Whenever a SACK is received missing a * TSN that was previously acknowledged via a * Gap Ack Block, start T3-rtx for the * destination address to which the DATA * chunk was originally * transmitted if it is not already running. */ restart_timer = 1; } list_add_tail(lchunk, &tlist); } } if (transport) { if (bytes_acked) { struct sctp_association *asoc = transport->asoc; /* We may have counted DATA that was migrated * to this transport due to DEL-IP operation. * Subtract those bytes, since the were never * send on this transport and shouldn't be * credited to this transport. */ bytes_acked -= migrate_bytes; /* 8.2. When an outstanding TSN is acknowledged, * the endpoint shall clear the error counter of * the destination transport address to which the * DATA chunk was last sent. * The association's overall error counter is * also cleared. */ transport->error_count = 0; transport->asoc->overall_error_count = 0; forward_progress = true; /* * While in SHUTDOWN PENDING, we may have started * the T5 shutdown guard timer after reaching the * retransmission limit. Stop that timer as soon * as the receiver acknowledged any data. */ if (asoc->state == SCTP_STATE_SHUTDOWN_PENDING && del_timer(&asoc->timers [SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD])) sctp_association_put(asoc); /* Mark the destination transport address as * active if it is not so marked. */ if ((transport->state == SCTP_INACTIVE || transport->state == SCTP_UNCONFIRMED) && sctp_cmp_addr_exact(&transport->ipaddr, saddr)) { sctp_assoc_control_transport( transport->asoc, transport, SCTP_TRANSPORT_UP, SCTP_RECEIVED_SACK); } sctp_transport_raise_cwnd(transport, sack_ctsn, bytes_acked); transport->flight_size -= bytes_acked; if (transport->flight_size == 0) transport->partial_bytes_acked = 0; q->outstanding_bytes -= bytes_acked + migrate_bytes; } else { /* RFC 2960 6.1, sctpimpguide-06 2.15.2 * When a sender is doing zero window probing, it * should not timeout the association if it continues * to receive new packets from the receiver. The * reason is that the receiver MAY keep its window * closed for an indefinite time. * A sender is doing zero window probing when the * receiver's advertised window is zero, and there is * only one data chunk in flight to the receiver. * * Allow the association to timeout while in SHUTDOWN * PENDING or SHUTDOWN RECEIVED in case the receiver * stays in zero window mode forever. */ if (!q->asoc->peer.rwnd && !list_empty(&tlist) && (sack_ctsn+2 == q->asoc->next_tsn) && q->asoc->state < SCTP_STATE_SHUTDOWN_PENDING) { pr_debug("%s: sack received for zero window " "probe:%u\n", __func__, sack_ctsn); q->asoc->overall_error_count = 0; transport->error_count = 0; } } /* RFC 2960 6.3.2 Retransmission Timer Rules * * R2) Whenever all outstanding data sent to an address have * been acknowledged, turn off the T3-rtx timer of that * address. */ if (!transport->flight_size) { if (del_timer(&transport->T3_rtx_timer)) sctp_transport_put(transport); } else if (restart_timer) { if (!mod_timer(&transport->T3_rtx_timer, jiffies + transport->rto)) sctp_transport_hold(transport); } if (forward_progress) { if (transport->dst) sctp_transport_dst_confirm(transport); } } list_splice(&tlist, transmitted_queue); } /* Mark chunks as missing and consequently may get retransmitted. */ static void sctp_mark_missing(struct sctp_outq *q, struct list_head *transmitted_queue, struct sctp_transport *transport, __u32 highest_new_tsn_in_sack, int count_of_newacks) { struct sctp_chunk *chunk; __u32 tsn; char do_fast_retransmit = 0; struct sctp_association *asoc = q->asoc; struct sctp_transport *primary = asoc->peer.primary_path; list_for_each_entry(chunk, transmitted_queue, transmitted_list) { tsn = ntohl(chunk->subh.data_hdr->tsn); /* RFC 2960 7.2.4, sctpimpguide-05 2.8.2 M3) Examine all * 'Unacknowledged TSN's', if the TSN number of an * 'Unacknowledged TSN' is smaller than the 'HighestTSNinSack' * value, increment the 'TSN.Missing.Report' count on that * chunk if it has NOT been fast retransmitted or marked for * fast retransmit already. */ if (chunk->fast_retransmit == SCTP_CAN_FRTX && !chunk->tsn_gap_acked && TSN_lt(tsn, highest_new_tsn_in_sack)) { /* SFR-CACC may require us to skip marking * this chunk as missing. */ if (!transport || !sctp_cacc_skip(primary, chunk->transport, count_of_newacks, tsn)) { chunk->tsn_missing_report++; pr_debug("%s: tsn:0x%x missing counter:%d\n", __func__, tsn, chunk->tsn_missing_report); } } /* * M4) If any DATA chunk is found to have a * 'TSN.Missing.Report' * value larger than or equal to 3, mark that chunk for * retransmission and start the fast retransmit procedure. */ if (chunk->tsn_missing_report >= 3) { chunk->fast_retransmit = SCTP_NEED_FRTX; do_fast_retransmit = 1; } } if (transport) { if (do_fast_retransmit) sctp_retransmit(q, transport, SCTP_RTXR_FAST_RTX); pr_debug("%s: transport:%p, cwnd:%d, ssthresh:%d, " "flight_size:%d, pba:%d\n", __func__, transport, transport->cwnd, transport->ssthresh, transport->flight_size, transport->partial_bytes_acked); } } /* Is the given TSN acked by this packet? */ static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn) { __u32 ctsn = ntohl(sack->cum_tsn_ack); union sctp_sack_variable *frags; __u16 tsn_offset, blocks; int i; if (TSN_lte(tsn, ctsn)) goto pass; /* 3.3.4 Selective Acknowledgment (SACK) (3): * * Gap Ack Blocks: * These fields contain the Gap Ack Blocks. They are repeated * for each Gap Ack Block up to the number of Gap Ack Blocks * defined in the Number of Gap Ack Blocks field. All DATA * chunks with TSNs greater than or equal to (Cumulative TSN * Ack + Gap Ack Block Start) and less than or equal to * (Cumulative TSN Ack + Gap Ack Block End) of each Gap Ack * Block are assumed to have been received correctly. */ frags = (union sctp_sack_variable *)(sack + 1); blocks = ntohs(sack->num_gap_ack_blocks); tsn_offset = tsn - ctsn; for (i = 0; i < blocks; ++i) { if (tsn_offset >= ntohs(frags[i].gab.start) && tsn_offset <= ntohs(frags[i].gab.end)) goto pass; } return 0; pass: return 1; } static inline int sctp_get_skip_pos(struct sctp_fwdtsn_skip *skiplist, int nskips, __be16 stream) { int i; for (i = 0; i < nskips; i++) { if (skiplist[i].stream == stream) return i; } return i; } /* Create and add a fwdtsn chunk to the outq's control queue if needed. */ void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn) { struct sctp_association *asoc = q->asoc; struct sctp_chunk *ftsn_chunk = NULL; struct sctp_fwdtsn_skip ftsn_skip_arr[10]; int nskips = 0; int skip_pos = 0; __u32 tsn; struct sctp_chunk *chunk; struct list_head *lchunk, *temp; if (!asoc->peer.prsctp_capable) return; /* PR-SCTP C1) Let SackCumAck be the Cumulative TSN ACK carried in the * received SACK. * * If (Advanced.Peer.Ack.Point < SackCumAck), then update * Advanced.Peer.Ack.Point to be equal to SackCumAck. */ if (TSN_lt(asoc->adv_peer_ack_point, ctsn)) asoc->adv_peer_ack_point = ctsn; /* PR-SCTP C2) Try to further advance the "Advanced.Peer.Ack.Point" * locally, that is, to move "Advanced.Peer.Ack.Point" up as long as * the chunk next in the out-queue space is marked as "abandoned" as * shown in the following example: * * Assuming that a SACK arrived with the Cumulative TSN ACK 102 * and the Advanced.Peer.Ack.Point is updated to this value: * * out-queue at the end of ==> out-queue after Adv.Ack.Point * normal SACK processing local advancement * ... ... * Adv.Ack.Pt-> 102 acked 102 acked * 103 abandoned 103 abandoned * 104 abandoned Adv.Ack.P-> 104 abandoned * 105 105 * 106 acked 106 acked * ... ... * * In this example, the data sender successfully advanced the * "Advanced.Peer.Ack.Point" from 102 to 104 locally. */ list_for_each_safe(lchunk, temp, &q->abandoned) { chunk = list_entry(lchunk, struct sctp_chunk, transmitted_list); tsn = ntohl(chunk->subh.data_hdr->tsn); /* Remove any chunks in the abandoned queue that are acked by * the ctsn. */ if (TSN_lte(tsn, ctsn)) { list_del_init(lchunk); sctp_chunk_free(chunk); } else { if (TSN_lte(tsn, asoc->adv_peer_ack_point+1)) { asoc->adv_peer_ack_point = tsn; if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) continue; skip_pos = sctp_get_skip_pos(&ftsn_skip_arr[0], nskips, chunk->subh.data_hdr->stream); ftsn_skip_arr[skip_pos].stream = chunk->subh.data_hdr->stream; ftsn_skip_arr[skip_pos].ssn = chunk->subh.data_hdr->ssn; if (skip_pos == nskips) nskips++; if (nskips == 10) break; } else break; } } /* PR-SCTP C3) If, after step C1 and C2, the "Advanced.Peer.Ack.Point" * is greater than the Cumulative TSN ACK carried in the received * SACK, the data sender MUST send the data receiver a FORWARD TSN * chunk containing the latest value of the * "Advanced.Peer.Ack.Point". * * C4) For each "abandoned" TSN the sender of the FORWARD TSN SHOULD * list each stream and sequence number in the forwarded TSN. This * information will enable the receiver to easily find any * stranded TSN's waiting on stream reorder queues. Each stream * SHOULD only be reported once; this means that if multiple * abandoned messages occur in the same stream then only the * highest abandoned stream sequence number is reported. If the * total size of the FORWARD TSN does NOT fit in a single MTU then * the sender of the FORWARD TSN SHOULD lower the * Advanced.Peer.Ack.Point to the last TSN that will fit in a * single MTU. */ if (asoc->adv_peer_ack_point > ctsn) ftsn_chunk = sctp_make_fwdtsn(asoc, asoc->adv_peer_ack_point, nskips, &ftsn_skip_arr[0]); if (ftsn_chunk) { list_add_tail(&ftsn_chunk->list, &q->control_chunk_list); SCTP_INC_STATS(asoc->base.net, SCTP_MIB_OUTCTRLCHUNKS); } }
95 10 1 8 61 58 61 60 60 62 148 148 148 148 124 1 90 90 3 6 1 1 96 20 21 20 9 9 83 53 98 38 3 3 3 13 35 3 3 3 3 5 1 217 3 70 144 146 88 87 87 31 90 3 18 7 63 61 100 519 278 408 411 2 2 1 1 2 2 1 10 10 48 82 10 41 219 218 2425 2416 59 62 65 602 611 251 411 128 10 43 1 219 212 212 18 527 320 44 251 41 363 40 2 11 16 1 18 41 36 7 18 7 7 323 13 5 236 8 8 8 8 919 265 330 3 3 3 3 4 24 30 11 155 15 15 22 19 100 75 25 8 24 35 128 25 649 650 653 77 645 37 44 152 43 679 132 135 135 132 135 133 7 39 39 1 96 14 146 146 146 138 25 25 25 8 8 2 36 53 20 129 19 10 11 10 9 10 53 478 51 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MM_H #define _LINUX_MM_H #include <linux/errno.h> #include <linux/mmdebug.h> #include <linux/gfp.h> #include <linux/pgalloc_tag.h> #include <linux/bug.h> #include <linux/list.h> #include <linux/mmzone.h> #include <linux/rbtree.h> #include <linux/atomic.h> #include <linux/debug_locks.h> #include <linux/mm_types.h> #include <linux/mmap_lock.h> #include <linux/range.h> #include <linux/pfn.h> #include <linux/percpu-refcount.h> #include <linux/bit_spinlock.h> #include <linux/shrinker.h> #include <linux/resource.h> #include <linux/page_ext.h> #include <linux/err.h> #include <linux/page-flags.h> #include <linux/page_ref.h> #include <linux/overflow.h> #include <linux/sizes.h> #include <linux/sched.h> #include <linux/pgtable.h> #include <linux/kasan.h> #include <linux/memremap.h> #include <linux/slab.h> struct mempolicy; struct anon_vma; struct anon_vma_chain; struct user_struct; struct pt_regs; struct folio_batch; extern int sysctl_page_lock_unfairness; void mm_core_init(void); void init_mm_internals(void); #ifndef CONFIG_NUMA /* Don't use mapnrs, do it properly */ extern unsigned long max_mapnr; static inline void set_max_mapnr(unsigned long limit) { max_mapnr = limit; } #else static inline void set_max_mapnr(unsigned long limit) { } #endif extern atomic_long_t _totalram_pages; static inline unsigned long totalram_pages(void) { return (unsigned long)atomic_long_read(&_totalram_pages); } static inline void totalram_pages_inc(void) { atomic_long_inc(&_totalram_pages); } static inline void totalram_pages_dec(void) { atomic_long_dec(&_totalram_pages); } static inline void totalram_pages_add(long count) { atomic_long_add(count, &_totalram_pages); } extern void * high_memory; extern int page_cluster; extern const int page_cluster_max; #ifdef CONFIG_SYSCTL extern int sysctl_legacy_va_layout; #else #define sysctl_legacy_va_layout 0 #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS extern const int mmap_rnd_bits_min; extern int mmap_rnd_bits_max __ro_after_init; extern int mmap_rnd_bits __read_mostly; #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS extern const int mmap_rnd_compat_bits_min; extern const int mmap_rnd_compat_bits_max; extern int mmap_rnd_compat_bits __read_mostly; #endif #include <asm/page.h> #include <asm/processor.h> #ifndef __pa_symbol #define __pa_symbol(x) __pa(RELOC_HIDE((unsigned long)(x), 0)) #endif #ifndef page_to_virt #define page_to_virt(x) __va(PFN_PHYS(page_to_pfn(x))) #endif #ifndef lm_alias #define lm_alias(x) __va(__pa_symbol(x)) #endif /* * To prevent common memory management code establishing * a zero page mapping on a read fault. * This macro should be defined within <asm/pgtable.h>. * s390 does this to prevent multiplexing of hardware bits * related to the physical page in case of virtualization. */ #ifndef mm_forbids_zeropage #define mm_forbids_zeropage(X) (0) #endif /* * On some architectures it is expensive to call memset() for small sizes. * If an architecture decides to implement their own version of * mm_zero_struct_page they should wrap the defines below in a #ifndef and * define their own version of this macro in <asm/pgtable.h> */ #if BITS_PER_LONG == 64 /* This function must be updated when the size of struct page grows above 96 * or reduces below 56. The idea that compiler optimizes out switch() * statement, and only leaves move/store instructions. Also the compiler can * combine write statements if they are both assignments and can be reordered, * this can result in several of the writes here being dropped. */ #define mm_zero_struct_page(pp) __mm_zero_struct_page(pp) static inline void __mm_zero_struct_page(struct page *page) { unsigned long *_pp = (void *)page; /* Check that struct page is either 56, 64, 72, 80, 88 or 96 bytes */ BUILD_BUG_ON(sizeof(struct page) & 7); BUILD_BUG_ON(sizeof(struct page) < 56); BUILD_BUG_ON(sizeof(struct page) > 96); switch (sizeof(struct page)) { case 96: _pp[11] = 0; fallthrough; case 88: _pp[10] = 0; fallthrough; case 80: _pp[9] = 0; fallthrough; case 72: _pp[8] = 0; fallthrough; case 64: _pp[7] = 0; fallthrough; case 56: _pp[6] = 0; _pp[5] = 0; _pp[4] = 0; _pp[3] = 0; _pp[2] = 0; _pp[1] = 0; _pp[0] = 0; } } #else #define mm_zero_struct_page(pp) ((void)memset((pp), 0, sizeof(struct page))) #endif /* * Default maximum number of active map areas, this limits the number of vmas * per mm struct. Users can overwrite this number by sysctl but there is a * problem. * * When a program's coredump is generated as ELF format, a section is created * per a vma. In ELF, the number of sections is represented in unsigned short. * This means the number of sections should be smaller than 65535 at coredump. * Because the kernel adds some informative sections to a image of program at * generating coredump, we need some margin. The number of extra sections is * 1-3 now and depends on arch. We use "5" as safe margin, here. * * ELF extended numbering allows more than 65535 sections, so 16-bit bound is * not a hard limit any more. Although some userspace tools can be surprised by * that. */ #define MAPCOUNT_ELF_CORE_MARGIN (5) #define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) extern int sysctl_max_map_count; extern unsigned long sysctl_user_reserve_kbytes; extern unsigned long sysctl_admin_reserve_kbytes; extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; extern unsigned long sysctl_overcommit_kbytes; int overcommit_ratio_handler(struct ctl_table *, int, void *, size_t *, loff_t *); int overcommit_kbytes_handler(struct ctl_table *, int, void *, size_t *, loff_t *); int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *, loff_t *); #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) #define folio_page_idx(folio, p) (page_to_pfn(p) - folio_pfn(folio)) #else #define nth_page(page,n) ((page) + (n)) #define folio_page_idx(folio, p) ((p) - &(folio)->page) #endif /* to align the pointer to the (next) page boundary */ #define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE) /* to align the pointer to the (prev) page boundary */ #define PAGE_ALIGN_DOWN(addr) ALIGN_DOWN(addr, PAGE_SIZE) /* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */ #define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PAGE_SIZE) static inline struct folio *lru_to_folio(struct list_head *head) { return list_entry((head)->prev, struct folio, lru); } void setup_initial_init_mm(void *start_code, void *end_code, void *end_data, void *brk); /* * Linux kernel virtual memory manager primitives. * The idea being to have a "virtual" mm in the same way * we have a virtual fs - giving a cleaner interface to the * mm details, and allowing different kinds of memory mappings * (from shared memory to executable loading to arbitrary * mmap() functions). */ struct vm_area_struct *vm_area_alloc(struct mm_struct *); struct vm_area_struct *vm_area_dup(struct vm_area_struct *); void vm_area_free(struct vm_area_struct *); /* Use only if VMA has no other users */ void __vm_area_free(struct vm_area_struct *vma); #ifndef CONFIG_MMU extern struct rb_root nommu_region_tree; extern struct rw_semaphore nommu_region_sem; extern unsigned int kobjsize(const void *objp); #endif /* * vm_flags in vm_area_struct, see mm_types.h. * When changing, update also include/trace/events/mmflags.h */ #define VM_NONE 0x00000000 #define VM_READ 0x00000001 /* currently active flags */ #define VM_WRITE 0x00000002 #define VM_EXEC 0x00000004 #define VM_SHARED 0x00000008 /* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */ #define VM_MAYREAD 0x00000010 /* limits for mprotect() etc */ #define VM_MAYWRITE 0x00000020 #define VM_MAYEXEC 0x00000040 #define VM_MAYSHARE 0x00000080 #define VM_GROWSDOWN 0x00000100 /* general info on the segment */ #ifdef CONFIG_MMU #define VM_UFFD_MISSING 0x00000200 /* missing pages tracking */ #else /* CONFIG_MMU */ #define VM_MAYOVERLAY 0x00000200 /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */ #define VM_UFFD_MISSING 0 #endif /* CONFIG_MMU */ #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ #define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */ #define VM_LOCKED 0x00002000 #define VM_IO 0x00004000 /* Memory mapped I/O or similar */ /* Used by sys_madvise() */ #define VM_SEQ_READ 0x00008000 /* App will access data sequentially */ #define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */ #define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ #define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ #define VM_LOCKONFAULT 0x00080000 /* Lock the pages covered when they are faulted in */ #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_SYNC 0x00800000 /* Synchronous page faults */ #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ #define VM_WIPEONFORK 0x02000000 /* Wipe VMA contents in child. */ #define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ #ifdef CONFIG_MEM_SOFT_DIRTY # define VM_SOFTDIRTY 0x08000000 /* Not soft dirty clean area */ #else # define VM_SOFTDIRTY 0 #endif #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ #define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */ #define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */ #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ #ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS #define VM_HIGH_ARCH_BIT_0 32 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_1 33 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0) #define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1) #define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2) #define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3) #define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4) #define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5) #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ #ifdef CONFIG_ARCH_HAS_PKEYS # define VM_PKEY_SHIFT VM_HIGH_ARCH_BIT_0 # define VM_PKEY_BIT0 VM_HIGH_ARCH_0 /* A protection key is a 4-bit value */ # define VM_PKEY_BIT1 VM_HIGH_ARCH_1 /* on x86 and 5-bit value on ppc64 */ # define VM_PKEY_BIT2 VM_HIGH_ARCH_2 # define VM_PKEY_BIT3 VM_HIGH_ARCH_3 #ifdef CONFIG_PPC # define VM_PKEY_BIT4 VM_HIGH_ARCH_4 #else # define VM_PKEY_BIT4 0 #endif #endif /* CONFIG_ARCH_HAS_PKEYS */ #ifdef CONFIG_X86_USER_SHADOW_STACK /* * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of * support core mm. * * These VMAs will get a single end guard page. This helps userspace protect * itself from attacks. A single page is enough for current shadow stack archs * (x86). See the comments near alloc_shstk() in arch/x86/kernel/shstk.c * for more details on the guard size. */ # define VM_SHADOW_STACK VM_HIGH_ARCH_5 #else # define VM_SHADOW_STACK VM_NONE #endif #if defined(CONFIG_X86) # define VM_PAT VM_ARCH_1 /* PAT reserves whole VMA at once (x86) */ #elif defined(CONFIG_PPC) # define VM_SAO VM_ARCH_1 /* Strong Access Ordering (powerpc) */ #elif defined(CONFIG_PARISC) # define VM_GROWSUP VM_ARCH_1 #elif defined(CONFIG_SPARC64) # define VM_SPARC_ADI VM_ARCH_1 /* Uses ADI tag for access control */ # define VM_ARCH_CLEAR VM_SPARC_ADI #elif defined(CONFIG_ARM64) # define VM_ARM64_BTI VM_ARCH_1 /* BTI guarded page, a.k.a. GP bit */ # define VM_ARCH_CLEAR VM_ARM64_BTI #elif !defined(CONFIG_MMU) # define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */ #endif #if defined(CONFIG_ARM64_MTE) # define VM_MTE VM_HIGH_ARCH_0 /* Use Tagged memory for access control */ # define VM_MTE_ALLOWED VM_HIGH_ARCH_1 /* Tagged memory permitted */ #else # define VM_MTE VM_NONE # define VM_MTE_ALLOWED VM_NONE #endif #ifndef VM_GROWSUP # define VM_GROWSUP VM_NONE #endif #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR # define VM_UFFD_MINOR_BIT 38 # define VM_UFFD_MINOR BIT(VM_UFFD_MINOR_BIT) /* UFFD minor faults */ #else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ # define VM_UFFD_MINOR VM_NONE #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ /* * This flag is used to connect VFIO to arch specific KVM code. It * indicates that the memory under this VMA is safe for use with any * non-cachable memory type inside KVM. Some VFIO devices, on some * platforms, are thought to be unsafe and can cause machine crashes * if KVM does not lock down the memory type. */ #ifdef CONFIG_64BIT #define VM_ALLOW_ANY_UNCACHED_BIT 39 #define VM_ALLOW_ANY_UNCACHED BIT(VM_ALLOW_ANY_UNCACHED_BIT) #else #define VM_ALLOW_ANY_UNCACHED VM_NONE #endif #ifdef CONFIG_64BIT /* VM is sealed, in vm_flags */ #define VM_SEALED _BITUL(63) #endif /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY) #define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) /* Common data flag combinations */ #define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) #define VM_DATA_FLAGS_NON_EXEC (VM_READ | VM_WRITE | VM_MAYREAD | \ VM_MAYWRITE | VM_MAYEXEC) #define VM_DATA_FLAGS_EXEC (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) #ifndef VM_DATA_DEFAULT_FLAGS /* arch can override this */ #define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_EXEC #endif #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS #endif #define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) #ifdef CONFIG_STACK_GROWSUP #define VM_STACK VM_GROWSUP #define VM_STACK_EARLY VM_GROWSDOWN #else #define VM_STACK VM_GROWSDOWN #define VM_STACK_EARLY 0 #endif #define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) /* VMA basic access permission flags */ #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) /* * Special vmas that are non-mergable, non-mlock()able. */ #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) /* This mask prevents VMA from being scanned with khugepaged */ #define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB) /* This mask defines which mm->def_flags a process can inherit its parent */ #define VM_INIT_DEF_MASK VM_NOHUGEPAGE /* This mask represents all the VMA flag bits used by mlock */ #define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT) /* Arch-specific flags to clear when updating VM flags on protection change */ #ifndef VM_ARCH_CLEAR # define VM_ARCH_CLEAR VM_NONE #endif #define VM_FLAGS_CLEAR (ARCH_VM_PKEY_FLAGS | VM_ARCH_CLEAR) /* * mapping from the currently active vm_flags protection bits (the * low four bits) to a page protection mask.. */ /* * The default fault flags that should be used by most of the * arch-specific page fault handlers. */ #define FAULT_FLAG_DEFAULT (FAULT_FLAG_ALLOW_RETRY | \ FAULT_FLAG_KILLABLE | \ FAULT_FLAG_INTERRUPTIBLE) /** * fault_flag_allow_retry_first - check ALLOW_RETRY the first time * @flags: Fault flags. * * This is mostly used for places where we want to try to avoid taking * the mmap_lock for too long a time when waiting for another condition * to change, in which case we can try to be polite to release the * mmap_lock in the first round to avoid potential starvation of other * processes that would also want the mmap_lock. * * Return: true if the page fault allows retry and this is the first * attempt of the fault handling; false otherwise. */ static inline bool fault_flag_allow_retry_first(enum fault_flag flags) { return (flags & FAULT_FLAG_ALLOW_RETRY) && (!(flags & FAULT_FLAG_TRIED)); } #define FAULT_FLAG_TRACE \ { FAULT_FLAG_WRITE, "WRITE" }, \ { FAULT_FLAG_MKWRITE, "MKWRITE" }, \ { FAULT_FLAG_ALLOW_RETRY, "ALLOW_RETRY" }, \ { FAULT_FLAG_RETRY_NOWAIT, "RETRY_NOWAIT" }, \ { FAULT_FLAG_KILLABLE, "KILLABLE" }, \ { FAULT_FLAG_TRIED, "TRIED" }, \ { FAULT_FLAG_USER, "USER" }, \ { FAULT_FLAG_REMOTE, "REMOTE" }, \ { FAULT_FLAG_INSTRUCTION, "INSTRUCTION" }, \ { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" }, \ { FAULT_FLAG_VMA_LOCK, "VMA_LOCK" } /* * vm_fault is filled by the pagefault handler and passed to the vma's * ->fault function. The vma's ->fault is responsible for returning a bitmask * of VM_FAULT_xxx flags that give details about how the fault was handled. * * MM layer fills up gfp_mask for page allocations but fault handler might * alter it if its implementation requires a different allocation context. * * pgoff should be used in favour of virtual_address, if possible. */ struct vm_fault { const struct { struct vm_area_struct *vma; /* Target VMA */ gfp_t gfp_mask; /* gfp mask to be used for allocations */ pgoff_t pgoff; /* Logical page offset based on vma */ unsigned long address; /* Faulting virtual address - masked */ unsigned long real_address; /* Faulting virtual address - unmasked */ }; enum fault_flag flags; /* FAULT_FLAG_xxx flags * XXX: should really be 'const' */ pmd_t *pmd; /* Pointer to pmd entry matching * the 'address' */ pud_t *pud; /* Pointer to pud entry matching * the 'address' */ union { pte_t orig_pte; /* Value of PTE at the time of fault */ pmd_t orig_pmd; /* Value of PMD at the time of fault, * used by PMD fault only. */ }; struct page *cow_page; /* Page handler may use for COW fault */ struct page *page; /* ->fault handlers should return a * page here, unless VM_FAULT_NOPAGE * is set (which is also implied by * VM_FAULT_ERROR). */ /* These three entries are valid only while holding ptl lock */ pte_t *pte; /* Pointer to pte entry matching * the 'address'. NULL if the page * table hasn't been allocated. */ spinlock_t *ptl; /* Page table lock. * Protects pte page table if 'pte' * is not NULL, otherwise pmd. */ pgtable_t prealloc_pte; /* Pre-allocated pte page table. * vm_ops->map_pages() sets up a page * table from atomic context. * do_fault_around() pre-allocates * page table to avoid allocation from * atomic context. */ }; /* * These are the virtual MM functions - opening of an area, closing and * unmapping it (needed to keep files on disk up-to-date etc), pointer * to the functions called when a no-page or a wp-page exception occurs. */ struct vm_operations_struct { void (*open)(struct vm_area_struct * area); /** * @close: Called when the VMA is being removed from the MM. * Context: User context. May sleep. Caller holds mmap_lock. */ void (*close)(struct vm_area_struct * area); /* Called any time before splitting to check if it's allowed */ int (*may_split)(struct vm_area_struct *area, unsigned long addr); int (*mremap)(struct vm_area_struct *area); /* * Called by mprotect() to make driver-specific permission * checks before mprotect() is finalised. The VMA must not * be modified. Returns 0 if mprotect() can proceed. */ int (*mprotect)(struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long newflags); vm_fault_t (*fault)(struct vm_fault *vmf); vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order); vm_fault_t (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); unsigned long (*pagesize)(struct vm_area_struct * area); /* notification that a previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ vm_fault_t (*page_mkwrite)(struct vm_fault *vmf); /* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */ vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf); /* called by access_process_vm when get_user_pages() fails, typically * for use by special VMAs. See also generic_access_phys() for a generic * implementation useful for any iomem mapping. */ int (*access)(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); /* Called by the /proc/PID/maps code to ask the vma whether it * has a special name. Returning non-NULL will also cause this * vma to be dumped unconditionally. */ const char *(*name)(struct vm_area_struct *vma); #ifdef CONFIG_NUMA /* * set_policy() op must add a reference to any non-NULL @new mempolicy * to hold the policy upon return. Caller should pass NULL @new to * remove a policy and fall back to surrounding context--i.e. do not * install a MPOL_DEFAULT policy, nor the task or system default * mempolicy. */ int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new); /* * get_policy() op must add reference [mpol_get()] to any policy at * (vma,addr) marked as MPOL_SHARED. The shared policy infrastructure * in mm/mempolicy.c will do this automatically. * get_policy() must NOT add a ref if the policy at (vma,addr) is not * marked as MPOL_SHARED. vma policies are protected by the mmap_lock. * If no [shared/vma] mempolicy exists at the addr, get_policy() op * must return NULL--i.e., do not "fallback" to task or system default * policy. */ struct mempolicy *(*get_policy)(struct vm_area_struct *vma, unsigned long addr, pgoff_t *ilx); #endif /* * Called by vm_normal_page() for special PTEs to find the * page for @addr. This is useful if the default behavior * (using pte_page()) would not find the correct page. */ struct page *(*find_special_page)(struct vm_area_struct *vma, unsigned long addr); }; #ifdef CONFIG_NUMA_BALANCING static inline void vma_numab_state_init(struct vm_area_struct *vma) { vma->numab_state = NULL; } static inline void vma_numab_state_free(struct vm_area_struct *vma) { kfree(vma->numab_state); } #else static inline void vma_numab_state_init(struct vm_area_struct *vma) {} static inline void vma_numab_state_free(struct vm_area_struct *vma) {} #endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_PER_VMA_LOCK /* * Try to read-lock a vma. The function is allowed to occasionally yield false * locked result to avoid performance overhead, in which case we fall back to * using mmap_lock. The function should never yield false unlocked result. */ static inline bool vma_start_read(struct vm_area_struct *vma) { /* * Check before locking. A race might cause false locked result. * We can use READ_ONCE() for the mm_lock_seq here, and don't need * ACQUIRE semantics, because this is just a lockless check whose result * we don't rely on for anything - the mm_lock_seq read against which we * need ordering is below. */ if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq)) return false; if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0)) return false; /* * Overflow might produce false locked result. * False unlocked result is impossible because we modify and check * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq * modification invalidates all existing locks. * * We must use ACQUIRE semantics for the mm_lock_seq so that if we are * racing with vma_end_write_all(), we only start reading from the VMA * after it has been unlocked. * This pairs with RELEASE semantics in vma_end_write_all(). */ if (unlikely(vma->vm_lock_seq == smp_load_acquire(&vma->vm_mm->mm_lock_seq))) { up_read(&vma->vm_lock->lock); return false; } return true; } static inline void vma_end_read(struct vm_area_struct *vma) { rcu_read_lock(); /* keeps vma alive till the end of up_read */ up_read(&vma->vm_lock->lock); rcu_read_unlock(); } /* WARNING! Can only be used if mmap_lock is expected to be write-locked */ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) { mmap_assert_write_locked(vma->vm_mm); /* * current task is holding mmap_write_lock, both vma->vm_lock_seq and * mm->mm_lock_seq can't be concurrently modified. */ *mm_lock_seq = vma->vm_mm->mm_lock_seq; return (vma->vm_lock_seq == *mm_lock_seq); } /* * Begin writing to a VMA. * Exclude concurrent readers under the per-VMA lock until the currently * write-locked mmap_lock is dropped or downgraded. */ static inline void vma_start_write(struct vm_area_struct *vma) { int mm_lock_seq; if (__is_vma_write_locked(vma, &mm_lock_seq)) return; down_write(&vma->vm_lock->lock); /* * We should use WRITE_ONCE() here because we can have concurrent reads * from the early lockless pessimistic check in vma_start_read(). * We don't really care about the correctness of that early check, but * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy. */ WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); up_write(&vma->vm_lock->lock); } static inline void vma_assert_write_locked(struct vm_area_struct *vma) { int mm_lock_seq; VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma); } static inline void vma_assert_locked(struct vm_area_struct *vma) { if (!rwsem_is_locked(&vma->vm_lock->lock)) vma_assert_write_locked(vma); } static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached) { /* When detaching vma should be write-locked */ if (detached) vma_assert_write_locked(vma); vma->detached = detached; } static inline void release_fault_lock(struct vm_fault *vmf) { if (vmf->flags & FAULT_FLAG_VMA_LOCK) vma_end_read(vmf->vma); else mmap_read_unlock(vmf->vma->vm_mm); } static inline void assert_fault_locked(struct vm_fault *vmf) { if (vmf->flags & FAULT_FLAG_VMA_LOCK) vma_assert_locked(vmf->vma); else mmap_assert_locked(vmf->vma->vm_mm); } struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, unsigned long address); #else /* CONFIG_PER_VMA_LOCK */ static inline bool vma_start_read(struct vm_area_struct *vma) { return false; } static inline void vma_end_read(struct vm_area_struct *vma) {} static inline void vma_start_write(struct vm_area_struct *vma) {} static inline void vma_assert_write_locked(struct vm_area_struct *vma) { mmap_assert_write_locked(vma->vm_mm); } static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached) {} static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, unsigned long address) { return NULL; } static inline void vma_assert_locked(struct vm_area_struct *vma) { mmap_assert_locked(vma->vm_mm); } static inline void release_fault_lock(struct vm_fault *vmf) { mmap_read_unlock(vmf->vma->vm_mm); } static inline void assert_fault_locked(struct vm_fault *vmf) { mmap_assert_locked(vmf->vma->vm_mm); } #endif /* CONFIG_PER_VMA_LOCK */ extern const struct vm_operations_struct vma_dummy_vm_ops; /* * WARNING: vma_init does not initialize vma->vm_lock. * Use vm_area_alloc()/vm_area_free() if vma needs locking. */ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) { memset(vma, 0, sizeof(*vma)); vma->vm_mm = mm; vma->vm_ops = &vma_dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); vma_mark_detached(vma, false); vma_numab_state_init(vma); } /* Use when VMA is not part of the VMA tree and needs no locking */ static inline void vm_flags_init(struct vm_area_struct *vma, vm_flags_t flags) { ACCESS_PRIVATE(vma, __vm_flags) = flags; } /* * Use when VMA is part of the VMA tree and modifications need coordination * Note: vm_flags_reset and vm_flags_reset_once do not lock the vma and * it should be locked explicitly beforehand. */ static inline void vm_flags_reset(struct vm_area_struct *vma, vm_flags_t flags) { vma_assert_write_locked(vma); vm_flags_init(vma, flags); } static inline void vm_flags_reset_once(struct vm_area_struct *vma, vm_flags_t flags) { vma_assert_write_locked(vma); WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags); } static inline void vm_flags_set(struct vm_area_struct *vma, vm_flags_t flags) { vma_start_write(vma); ACCESS_PRIVATE(vma, __vm_flags) |= flags; } static inline void vm_flags_clear(struct vm_area_struct *vma, vm_flags_t flags) { vma_start_write(vma); ACCESS_PRIVATE(vma, __vm_flags) &= ~flags; } /* * Use only if VMA is not part of the VMA tree or has no other users and * therefore needs no locking. */ static inline void __vm_flags_mod(struct vm_area_struct *vma, vm_flags_t set, vm_flags_t clear) { vm_flags_init(vma, (vma->vm_flags | set) & ~clear); } /* * Use only when the order of set/clear operations is unimportant, otherwise * use vm_flags_{set|clear} explicitly. */ static inline void vm_flags_mod(struct vm_area_struct *vma, vm_flags_t set, vm_flags_t clear) { vma_start_write(vma); __vm_flags_mod(vma, set, clear); } static inline void vma_set_anonymous(struct vm_area_struct *vma) { vma->vm_ops = NULL; } static inline bool vma_is_anonymous(struct vm_area_struct *vma) { return !vma->vm_ops; } /* * Indicate if the VMA is a heap for the given task; for * /proc/PID/maps that is the heap of the main task. */ static inline bool vma_is_initial_heap(const struct vm_area_struct *vma) { return vma->vm_start < vma->vm_mm->brk && vma->vm_end > vma->vm_mm->start_brk; } /* * Indicate if the VMA is a stack for the given task; for * /proc/PID/maps that is the stack of the main task. */ static inline bool vma_is_initial_stack(const struct vm_area_struct *vma) { /* * We make no effort to guess what a given thread considers to be * its "stack". It's not even well-defined for programs written * languages like Go. */ return vma->vm_start <= vma->vm_mm->start_stack && vma->vm_end >= vma->vm_mm->start_stack; } static inline bool vma_is_temporary_stack(struct vm_area_struct *vma) { int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); if (!maybe_stack) return false; if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) == VM_STACK_INCOMPLETE_SETUP) return true; return false; } static inline bool vma_is_foreign(struct vm_area_struct *vma) { if (!current->mm) return true; if (current->mm != vma->vm_mm) return true; return false; } static inline bool vma_is_accessible(struct vm_area_struct *vma) { return vma->vm_flags & VM_ACCESS_FLAGS; } static inline bool is_shared_maywrite(vm_flags_t vm_flags) { return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == (VM_SHARED | VM_MAYWRITE); } static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma) { return is_shared_maywrite(vma->vm_flags); } static inline struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max) { return mas_find(&vmi->mas, max - 1); } static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi) { /* * Uses mas_find() to get the first VMA when the iterator starts. * Calling mas_next() could skip the first entry. */ return mas_find(&vmi->mas, ULONG_MAX); } static inline struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi) { return mas_next_range(&vmi->mas, ULONG_MAX); } static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi) { return mas_prev(&vmi->mas, 0); } static inline struct vm_area_struct *vma_iter_prev_range(struct vma_iterator *vmi) { return mas_prev_range(&vmi->mas, 0); } static inline unsigned long vma_iter_addr(struct vma_iterator *vmi) { return vmi->mas.index; } static inline unsigned long vma_iter_end(struct vma_iterator *vmi) { return vmi->mas.last + 1; } static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi, unsigned long count) { return mas_expected_entries(&vmi->mas, count); } static inline int vma_iter_clear_gfp(struct vma_iterator *vmi, unsigned long start, unsigned long end, gfp_t gfp) { __mas_set_range(&vmi->mas, start, end - 1); mas_store_gfp(&vmi->mas, NULL, gfp); if (unlikely(mas_is_err(&vmi->mas))) return -ENOMEM; return 0; } /* Free any unused preallocations */ static inline void vma_iter_free(struct vma_iterator *vmi) { mas_destroy(&vmi->mas); } static inline int vma_iter_bulk_store(struct vma_iterator *vmi, struct vm_area_struct *vma) { vmi->mas.index = vma->vm_start; vmi->mas.last = vma->vm_end - 1; mas_store(&vmi->mas, vma); if (unlikely(mas_is_err(&vmi->mas))) return -ENOMEM; return 0; } static inline void vma_iter_invalidate(struct vma_iterator *vmi) { mas_pause(&vmi->mas); } static inline void vma_iter_set(struct vma_iterator *vmi, unsigned long addr) { mas_set(&vmi->mas, addr); } #define for_each_vma(__vmi, __vma) \ while (((__vma) = vma_next(&(__vmi))) != NULL) /* The MM code likes to work with exclusive end addresses */ #define for_each_vma_range(__vmi, __vma, __end) \ while (((__vma) = vma_find(&(__vmi), (__end))) != NULL) #ifdef CONFIG_SHMEM /* * The vma_is_shmem is not inline because it is used only by slow * paths in userfault. */ bool vma_is_shmem(struct vm_area_struct *vma); bool vma_is_anon_shmem(struct vm_area_struct *vma); #else static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; } static inline bool vma_is_anon_shmem(struct vm_area_struct *vma) { return false; } #endif int vma_is_stack_for_current(struct vm_area_struct *vma); /* flush_tlb_range() takes a vma, not a mm, and can care about flags */ #define TLB_FLUSH_VMA(mm,flags) { .vm_mm = (mm), .vm_flags = (flags) } struct mmu_gather; struct inode; /* * compound_order() can be called without holding a reference, which means * that niceties like page_folio() don't work. These callers should be * prepared to handle wild return values. For example, PG_head may be * set before the order is initialised, or this may be a tail page. * See compaction.c for some good examples. */ static inline unsigned int compound_order(struct page *page) { struct folio *folio = (struct folio *)page; if (!test_bit(PG_head, &folio->flags)) return 0; return folio->_flags_1 & 0xff; } /** * folio_order - The allocation order of a folio. * @folio: The folio. * * A folio is composed of 2^order pages. See get_order() for the definition * of order. * * Return: The order of the folio. */ static inline unsigned int folio_order(struct folio *folio) { if (!folio_test_large(folio)) return 0; return folio->_flags_1 & 0xff; } #include <linux/huge_mm.h> /* * Methods to modify the page usage count. * * What counts for a page usage: * - cache mapping (page->mapping) * - private data (page->private) * - page mapped in a task's page tables, each mapping * is counted separately * * Also, many kernel routines increase the page count before a critical * routine so they can be sure the page doesn't go away from under them. */ /* * Drop a ref, return true if the refcount fell to zero (the page has no users) */ static inline int put_page_testzero(struct page *page) { VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); return page_ref_dec_and_test(page); } static inline int folio_put_testzero(struct folio *folio) { return put_page_testzero(&folio->page); } /* * Try to grab a ref unless the page has a refcount of zero, return false if * that is the case. * This can be called when MMU is off so it must not access * any of the virtual mappings. */ static inline bool get_page_unless_zero(struct page *page) { return page_ref_add_unless(page, 1, 0); } static inline struct folio *folio_get_nontail_page(struct page *page) { if (unlikely(!get_page_unless_zero(page))) return NULL; return (struct folio *)page; } extern int page_is_ram(unsigned long pfn); enum { REGION_INTERSECTS, REGION_DISJOINT, REGION_MIXED, }; int region_intersects(resource_size_t offset, size_t size, unsigned long flags, unsigned long desc); /* Support for virtually mapped pages */ struct page *vmalloc_to_page(const void *addr); unsigned long vmalloc_to_pfn(const void *addr); /* * Determine if an address is within the vmalloc range * * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there * is no special casing required. */ #ifdef CONFIG_MMU extern bool is_vmalloc_addr(const void *x); extern int is_vmalloc_or_module_addr(const void *x); #else static inline bool is_vmalloc_addr(const void *x) { return false; } static inline int is_vmalloc_or_module_addr(const void *x) { return 0; } #endif /* * How many times the entire folio is mapped as a single unit (eg by a * PMD or PUD entry). This is probably not what you want, except for * debugging purposes - it does not include PTE-mapped sub-pages; look * at folio_mapcount() or page_mapcount() instead. */ static inline int folio_entire_mapcount(const struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); return atomic_read(&folio->_entire_mapcount) + 1; } /* * The atomic page->_mapcount, starts from -1: so that transitions * both from it and to it can be tracked, using atomic_inc_and_test * and atomic_add_negative(-1). */ static inline void page_mapcount_reset(struct page *page) { atomic_set(&(page)->_mapcount, -1); } /** * page_mapcount() - Number of times this precise page is mapped. * @page: The page. * * The number of times this page is mapped. If this page is part of * a large folio, it includes the number of times this page is mapped * as part of that folio. * * Will report 0 for pages which cannot be mapped into userspace, eg * slab, page tables and similar. */ static inline int page_mapcount(struct page *page) { int mapcount = atomic_read(&page->_mapcount) + 1; /* Handle page_has_type() pages */ if (mapcount < PAGE_MAPCOUNT_RESERVE + 1) mapcount = 0; if (unlikely(PageCompound(page))) mapcount += folio_entire_mapcount(page_folio(page)); return mapcount; } static inline int folio_large_mapcount(const struct folio *folio) { VM_WARN_ON_FOLIO(!folio_test_large(folio), folio); return atomic_read(&folio->_large_mapcount) + 1; } /** * folio_mapcount() - Number of mappings of this folio. * @folio: The folio. * * The folio mapcount corresponds to the number of present user page table * entries that reference any part of a folio. Each such present user page * table entry must be paired with exactly on folio reference. * * For ordindary folios, each user page table entry (PTE/PMD/PUD/...) counts * exactly once. * * For hugetlb folios, each abstracted "hugetlb" user page table entry that * references the entire folio counts exactly once, even when such special * page table entries are comprised of multiple ordinary page table entries. * * Will report 0 for pages which cannot be mapped into userspace, such as * slab, page tables and similar. * * Return: The number of times this folio is mapped. */ static inline int folio_mapcount(const struct folio *folio) { int mapcount;