25 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_PKT_SCHED_H #define __NET_PKT_SCHED_H #include <linux/jiffies.h> #include <linux/ktime.h> #include <linux/if_vlan.h> #include <linux/netdevice.h> #include <net/sch_generic.h> #include <net/net_namespace.h> #include <uapi/linux/pkt_sched.h> #define DEFAULT_TX_QUEUE_LEN 1000 #define STAB_SIZE_LOG_MAX 30 struct qdisc_walker { int stop; int skip; int count; int (*fn)(struct Qdisc *, unsigned long cl, struct qdisc_walker *); }; #define qdisc_priv(q) \ _Generic(q, \ const struct Qdisc * : (const void *)&q->privdata, \ struct Qdisc * : (void *)&q->privdata) static inline struct Qdisc *qdisc_from_priv(void *priv) { return container_of(priv, struct Qdisc, privdata); } /* Timer resolution MUST BE < 10% of min_schedulable_packet_size/bandwidth Normal IP packet size ~ 512byte, hence: 0.5Kbyte/1Mbyte/sec = 0.5msec, so that we need 50usec timer for 10Mbit ethernet. 
10msec resolution -> <50Kbit/sec. The result: [34]86 is not good choice for QoS router :-( The things are not so bad, because we may use artificial clock evaluated by integration of network data flow in the most critical places. */ typedef u64 psched_time_t; typedef long psched_tdiff_t; /* Avoid doing 64 bit divide */ #define PSCHED_SHIFT 6 #define PSCHED_TICKS2NS(x) ((s64)(x) << PSCHED_SHIFT) #define PSCHED_NS2TICKS(x) ((x) >> PSCHED_SHIFT) #define PSCHED_TICKS_PER_SEC PSCHED_NS2TICKS(NSEC_PER_SEC) #define PSCHED_PASTPERFECT 0 static inline psched_time_t psched_get_time(void) { return PSCHED_NS2TICKS(ktime_get_ns()); } struct qdisc_watchdog { struct hrtimer timer; struct Qdisc *qdisc; }; void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc, clockid_t clockid); void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc); void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires, u64 delta_ns); static inline void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires) { return qdisc_watchdog_schedule_range_ns(wd, expires, 0ULL); } static inline void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires) { qdisc_watchdog_schedule_ns(wd, PSCHED_TICKS2NS(expires)); } void qdisc_watchdog_cancel(struct qdisc_watchdog *wd); extern struct Qdisc_ops pfifo_qdisc_ops; extern struct Qdisc_ops bfifo_qdisc_ops; extern struct Qdisc_ops pfifo_head_drop_qdisc_ops; int fifo_set_limit(struct Qdisc *q, unsigned int limit); struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops, unsigned int limit, struct netlink_ext_ack *extack); int register_qdisc(struct Qdisc_ops *qops); void unregister_qdisc(struct Qdisc_ops *qops); #define NET_SCH_ALIAS_PREFIX "net-sch-" #define MODULE_ALIAS_NET_SCH(id) MODULE_ALIAS(NET_SCH_ALIAS_PREFIX id) void qdisc_get_default(char *id, size_t len); int qdisc_set_default(const char *id); void qdisc_hash_add(struct Qdisc *q, bool invisible); void 
qdisc_hash_del(struct Qdisc *q); struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle); struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle); struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab, struct netlink_ext_ack *extack); void qdisc_put_rtab(struct qdisc_rate_table *tab); void qdisc_put_stab(struct qdisc_size_table *tab); void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc); bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq, spinlock_t *root_lock, bool validate); void __qdisc_run(struct Qdisc *q); static inline void qdisc_run(struct Qdisc *q) { if (qdisc_run_begin(q)) { __qdisc_run(q); qdisc_run_end(q); } } extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1]; /* Calculate maximal size of packet seen by hard_start_xmit routine of this device. */ static inline unsigned int psched_mtu(const struct net_device *dev) { return READ_ONCE(dev->mtu) + dev->hard_header_len; } static inline struct net *qdisc_net(struct Qdisc *q) { return dev_net(q->dev_queue->dev); } struct tc_query_caps_base { enum tc_setup_type type; void *caps; }; struct tc_cbs_qopt_offload { u8 enable; s32 queue; s32 hicredit; s32 locredit; s32 idleslope; s32 sendslope; }; struct tc_etf_qopt_offload { u8 enable; s32 queue; }; struct tc_mqprio_caps { bool validate_queue_counts:1; }; struct tc_mqprio_qopt_offload { /* struct tc_mqprio_qopt must always be the first element */ struct tc_mqprio_qopt qopt; struct netlink_ext_ack *extack; u16 mode; u16 shaper; u32 flags; u64 min_rate[TC_QOPT_MAX_QUEUE]; u64 max_rate[TC_QOPT_MAX_QUEUE]; unsigned long preemptible_tcs; }; struct tc_taprio_caps { bool supports_queue_max_sdu:1; bool gate_mask_per_txq:1; /* Device expects lower TXQ numbers to have higher priority over higher * TXQs, regardless of their TC mapping. DO NOT USE FOR NEW DRIVERS, * INSTEAD ENFORCE A PROPER TC:TXQ MAPPING COMING FROM USER SPACE. 
*/ bool broken_mqprio:1; }; enum tc_taprio_qopt_cmd { TAPRIO_CMD_REPLACE, TAPRIO_CMD_DESTROY, TAPRIO_CMD_STATS, TAPRIO_CMD_QUEUE_STATS, }; /** * struct tc_taprio_qopt_stats - IEEE 802.1Qbv statistics * @window_drops: Frames that were dropped because they were too large to be * transmitted in any of the allotted time windows (open gates) for their * traffic class. * @tx_overruns: Frames still being transmitted by the MAC after the * transmission gate associated with their traffic class has closed. * Equivalent to `12.29.1.1.2 TransmissionOverrun` from 802.1Q-2018. */ struct tc_taprio_qopt_stats { u64 window_drops; u64 tx_overruns; }; struct tc_taprio_qopt_queue_stats { int queue; struct tc_taprio_qopt_stats stats; }; struct tc_taprio_sched_entry { u8 command; /* TC_TAPRIO_CMD_* */ /* The gate_mask in the offloading side refers to traffic classes */ u32 gate_mask; u32 interval; }; struct tc_taprio_qopt_offload { enum tc_taprio_qopt_cmd cmd; union { /* TAPRIO_CMD_STATS */ struct tc_taprio_qopt_stats stats; /* TAPRIO_CMD_QUEUE_STATS */ struct tc_taprio_qopt_queue_stats queue_stats; /* TAPRIO_CMD_REPLACE */ struct { struct tc_mqprio_qopt_offload mqprio; struct netlink_ext_ack *extack; ktime_t base_time; u64 cycle_time; u64 cycle_time_extension; u32 max_sdu[TC_MAX_QUEUE]; size_t num_entries; struct tc_taprio_sched_entry entries[]; }; }; }; #if IS_ENABLED(CONFIG_NET_SCH_TAPRIO) /* Reference counting */ struct tc_taprio_qopt_offload *taprio_offload_get(struct tc_taprio_qopt_offload *offload); void taprio_offload_free(struct tc_taprio_qopt_offload *offload); #else /* Reference counting */ static inline struct tc_taprio_qopt_offload * taprio_offload_get(struct tc_taprio_qopt_offload *offload) { return NULL; } static inline void taprio_offload_free(struct tc_taprio_qopt_offload *offload) { } #endif /* Ensure skb_mstamp_ns, which might have been populated with the txtime, is * not mistaken for a software timestamp, because this will otherwise prevent * the dispatch of hardware 
timestamps to the socket. */ static inline void skb_txtime_consumed(struct sk_buff *skb) { skb->tstamp = ktime_set(0, 0); } static inline bool tc_qdisc_stats_dump(struct Qdisc *sch, unsigned long cl, struct qdisc_walker *arg) { if (arg->count >= arg->skip && arg->fn(sch, cl, arg) < 0) { arg->stop = 1; return false; } arg->count++; return true; } #endif
11 5 5 5 104 95 12 11 11 10 4 4 4 4 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 // SPDX-License-Identifier: GPL-2.0 #include "cgroup-internal.h" #include <linux/sched/task.h> #include <linux/slab.h> #include <linux/nsproxy.h> #include <linux/proc_ns.h> /* cgroup namespaces */ static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES); } static void dec_cgroup_namespaces(struct ucounts *ucounts) { dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES); } static struct cgroup_namespace *alloc_cgroup_ns(void) { struct cgroup_namespace *new_ns; int ret; new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT); if (!new_ns) return ERR_PTR(-ENOMEM); ret = ns_alloc_inum(&new_ns->ns); if (ret) { kfree(new_ns); return ERR_PTR(ret); } refcount_set(&new_ns->ns.count, 1); new_ns->ns.ops = &cgroupns_operations; return new_ns; } void free_cgroup_ns(struct cgroup_namespace *ns) { put_css_set(ns->root_cset); dec_cgroup_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); kfree(ns); } EXPORT_SYMBOL(free_cgroup_ns); struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns) { struct cgroup_namespace *new_ns; struct ucounts *ucounts; struct css_set *cset; BUG_ON(!old_ns); if (!(flags & CLONE_NEWCGROUP)) { get_cgroup_ns(old_ns); return old_ns; } /* Allow only sysadmin to create cgroup namespace. 
*/ if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); ucounts = inc_cgroup_namespaces(user_ns); if (!ucounts) return ERR_PTR(-ENOSPC); /* It is not safe to take cgroup_mutex here */ spin_lock_irq(&css_set_lock); cset = task_css_set(current); get_css_set(cset); spin_unlock_irq(&css_set_lock); new_ns = alloc_cgroup_ns(); if (IS_ERR(new_ns)) { put_css_set(cset); dec_cgroup_namespaces(ucounts); return new_ns; } new_ns->user_ns = get_user_ns(user_ns); new_ns->ucounts = ucounts; new_ns->root_cset = cset; return new_ns; } static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) { return container_of(ns, struct cgroup_namespace, ns); } static int cgroupns_install(struct nsset *nsset, struct ns_common *ns) { struct nsproxy *nsproxy = nsset->nsproxy; struct cgroup_namespace *cgroup_ns = to_cg_ns(ns); if (!ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN) || !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; /* Don't need to do anything if we are attaching to our own cgroupns. */ if (cgroup_ns == nsproxy->cgroup_ns) return 0; get_cgroup_ns(cgroup_ns); put_cgroup_ns(nsproxy->cgroup_ns); nsproxy->cgroup_ns = cgroup_ns; return 0; } static struct ns_common *cgroupns_get(struct task_struct *task) { struct cgroup_namespace *ns = NULL; struct nsproxy *nsproxy; task_lock(task); nsproxy = task->nsproxy; if (nsproxy) { ns = nsproxy->cgroup_ns; get_cgroup_ns(ns); } task_unlock(task); return ns ? &ns->ns : NULL; } static void cgroupns_put(struct ns_common *ns) { put_cgroup_ns(to_cg_ns(ns)); } static struct user_namespace *cgroupns_owner(struct ns_common *ns) { return to_cg_ns(ns)->user_ns; } const struct proc_ns_operations cgroupns_operations = { .name = "cgroup", .type = CLONE_NEWCGROUP, .get = cgroupns_get, .put = cgroupns_put, .install = cgroupns_install, .owner = cgroupns_owner, };
6 9 14 543 253 35 350 379 1248 535 122 36 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 
// SPDX-License-Identifier: GPL-2.0+ /* * ext4_jbd2.h * * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 * * Copyright 1998--1999 Red Hat corp --- All Rights Reserved * * Ext4-specific journaling extensions. */ #ifndef _EXT4_JBD2_H #define _EXT4_JBD2_H #include <linux/fs.h> #include <linux/jbd2.h> #include "ext4.h" #define EXT4_JOURNAL(inode) (EXT4_SB((inode)->i_sb)->s_journal) /* Define the number of blocks we need to account to a transaction to * modify one block of data. * * We may have to touch one inode, one bitmap buffer, up to three * indirection blocks, the group and superblock summaries, and the data * block to complete the transaction. * * For extents-enabled fs we may have to allocate and modify up to * 5 levels of tree, data block (for each of these we need bitmap + group * summaries), root which is stored in the inode, sb */ #define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \ (ext4_has_feature_extents(sb) ? 20U : 8U) /* Extended attribute operations touch at most two data buffers, * two bitmap buffers, and two group summaries, in addition to the inode * and the superblock, which are already accounted for. */ #define EXT4_XATTR_TRANS_BLOCKS 6U /* Define the minimum size for a transaction which modifies data. This * needs to take into account the fact that we may end up modifying two * quota files too (one for the group, one for the user quota). The * superblock only gets updated once, of course, so don't bother * counting that again for the quota updates. */ #define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \ EXT4_XATTR_TRANS_BLOCKS - 2 + \ EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) /* * Define the number of metadata blocks we need to account to modify data. * * This include super block, inode block, quota blocks and xattr blocks */ #define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \ EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) /* Define an arbitrary limit for the amount of data we will anticipate * writing to any given transaction. 
For unbounded transactions such as * write(2) and truncate(2) we can write more than this, but we always * start off at the maximum transaction size and grow the transaction * optimistically as we go. */ #define EXT4_MAX_TRANS_DATA 64U /* We break up a large truncate or write transaction once the handle's * buffer credits gets this low, we need either to extend the * transaction or to start a new one. Reserve enough space here for * inode, bitmap, superblock, group and indirection updates for at least * one block, plus two quota updates. Quota allocations are not * needed. */ #define EXT4_RESERVE_TRANS_BLOCKS 12U /* * Number of credits needed if we need to insert an entry into a * directory. For each new index block, we need 4 blocks (old index * block, new index block, bitmap block, bg summary). For normal * htree directories there are 2 levels; if the largedir feature * enabled it's 3 levels. */ #define EXT4_INDEX_EXTRA_TRANS_BLOCKS 12U #ifdef CONFIG_QUOTA /* Amount of blocks needed for quota update - we know that the structure was * allocated so we need to update only data block */ #define EXT4_QUOTA_TRANS_BLOCKS(sb) ((ext4_quota_capable(sb)) ? 
1 : 0) /* Amount of blocks needed for quota insert/delete - we do some block writes * but inode, sb and group updates are done only once */ #define EXT4_QUOTA_INIT_BLOCKS(sb) ((ext4_quota_capable(sb)) ?\ (DQUOT_INIT_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\ +3+DQUOT_INIT_REWRITE) : 0) #define EXT4_QUOTA_DEL_BLOCKS(sb) ((ext4_quota_capable(sb)) ?\ (DQUOT_DEL_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\ +3+DQUOT_DEL_REWRITE) : 0) #else #define EXT4_QUOTA_TRANS_BLOCKS(sb) 0 #define EXT4_QUOTA_INIT_BLOCKS(sb) 0 #define EXT4_QUOTA_DEL_BLOCKS(sb) 0 #endif #define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb)) #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) /* * Ext4 handle operation types -- for logging purposes */ #define EXT4_HT_MISC 0 #define EXT4_HT_INODE 1 #define EXT4_HT_WRITE_PAGE 2 #define EXT4_HT_MAP_BLOCKS 3 #define EXT4_HT_DIR 4 #define EXT4_HT_TRUNCATE 5 #define EXT4_HT_QUOTA 6 #define EXT4_HT_RESIZE 7 #define EXT4_HT_MIGRATE 8 #define EXT4_HT_MOVE_EXTENTS 9 #define EXT4_HT_XATTR 10 #define EXT4_HT_EXT_CONVERT 11 #define EXT4_HT_MAX 12 /** * struct ext4_journal_cb_entry - Base structure for callback information. * * This struct is a 'seed' structure for a using with your own callback * structs. If you are using callbacks you must allocate one of these * or another struct of your own definition which has this struct * as it's first element and pass it to ext4_journal_callback_add(). 
*/ struct ext4_journal_cb_entry { /* list information for other callbacks attached to the same handle */ struct list_head jce_list; /* Function to call with this callback structure */ void (*jce_func)(struct super_block *sb, struct ext4_journal_cb_entry *jce, int error); /* user data goes here */ }; /** * ext4_journal_callback_add: add a function to call after transaction commit * @handle: active journal transaction handle to register callback on * @func: callback function to call after the transaction has committed: * @sb: superblock of current filesystem for transaction * @jce: returned journal callback data * @rc: journal state at commit (0 = transaction committed properly) * @jce: journal callback data (internal and function private data struct) * * The registered function will be called in the context of the journal thread * after the transaction for which the handle was created has completed. * * No locks are held when the callback function is called, so it is safe to * call blocking functions from within the callback, but the callback should * not block or run for too long, or the filesystem will be blocked waiting for * the next transaction to commit. No journaling functions can be used, or * there is a risk of deadlock. * * There is no guaranteed calling order of multiple registered callbacks on * the same transaction. 
*/ static inline void _ext4_journal_callback_add(handle_t *handle, struct ext4_journal_cb_entry *jce) { /* Add the jce to transaction's private list */ list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list); } static inline void ext4_journal_callback_add(handle_t *handle, void (*func)(struct super_block *sb, struct ext4_journal_cb_entry *jce, int rc), struct ext4_journal_cb_entry *jce) { struct ext4_sb_info *sbi = EXT4_SB(handle->h_transaction->t_journal->j_private); /* Add the jce to transaction's private list */ jce->jce_func = func; spin_lock(&sbi->s_md_lock); _ext4_journal_callback_add(handle, jce); spin_unlock(&sbi->s_md_lock); } /** * ext4_journal_callback_del: delete a registered callback * @handle: active journal transaction handle on which callback was registered * @jce: registered journal callback entry to unregister * Return true if object was successfully removed */ static inline bool ext4_journal_callback_try_del(handle_t *handle, struct ext4_journal_cb_entry *jce) { bool deleted; struct ext4_sb_info *sbi = EXT4_SB(handle->h_transaction->t_journal->j_private); spin_lock(&sbi->s_md_lock); deleted = !list_empty(&jce->jce_list); list_del_init(&jce->jce_list); spin_unlock(&sbi->s_md_lock); return deleted; } int ext4_mark_iloc_dirty(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc); /* * On success, We end up with an outstanding reference count against * iloc->bh. This _must_ be cleaned up later. */ int ext4_reserve_inode_write(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc); #define ext4_mark_inode_dirty(__h, __i) \ __ext4_mark_inode_dirty((__h), (__i), __func__, __LINE__) int __ext4_mark_inode_dirty(handle_t *handle, struct inode *inode, const char *func, unsigned int line); int ext4_expand_extra_isize(struct inode *inode, unsigned int new_extra_isize, struct ext4_iloc *iloc); /* * Wrapper functions with which ext4 calls into JBD. 
*/ int __ext4_journal_get_write_access(const char *where, unsigned int line, handle_t *handle, struct super_block *sb, struct buffer_head *bh, enum ext4_journal_trigger_type trigger_type); int __ext4_forget(const char *where, unsigned int line, handle_t *handle, int is_metadata, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t blocknr); int __ext4_journal_get_create_access(const char *where, unsigned int line, handle_t *handle, struct super_block *sb, struct buffer_head *bh, enum ext4_journal_trigger_type trigger_type); int __ext4_handle_dirty_metadata(const char *where, unsigned int line, handle_t *handle, struct inode *inode, struct buffer_head *bh); #define ext4_journal_get_write_access(handle, sb, bh, trigger_type) \ __ext4_journal_get_write_access(__func__, __LINE__, (handle), (sb), \ (bh), (trigger_type)) #define ext4_forget(handle, is_metadata, inode, bh, block_nr) \ __ext4_forget(__func__, __LINE__, (handle), (is_metadata), (inode), \ (bh), (block_nr)) #define ext4_journal_get_create_access(handle, sb, bh, trigger_type) \ __ext4_journal_get_create_access(__func__, __LINE__, (handle), (sb), \ (bh), (trigger_type)) #define ext4_handle_dirty_metadata(handle, inode, bh) \ __ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \ (bh)) handle_t *__ext4_journal_start_sb(struct inode *inode, struct super_block *sb, unsigned int line, int type, int blocks, int rsv_blocks, int revoke_creds); int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle); #define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) /* Note: Do not use this for NULL handles. This is only to determine if * a properly allocated handle is using a journal or not. 
*/ static inline int ext4_handle_valid(handle_t *handle) { if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT) return 0; return 1; } static inline void ext4_handle_sync(handle_t *handle) { if (ext4_handle_valid(handle)) handle->h_sync = 1; } static inline int ext4_handle_is_aborted(handle_t *handle) { if (ext4_handle_valid(handle)) return is_handle_aborted(handle); return 0; } static inline int ext4_free_metadata_revoke_credits(struct super_block *sb, int blocks) { /* Freeing each metadata block can result in freeing one cluster */ return blocks * EXT4_SB(sb)->s_cluster_ratio; } static inline int ext4_trans_default_revoke_credits(struct super_block *sb) { return ext4_free_metadata_revoke_credits(sb, 8); } #define ext4_journal_start_sb(sb, type, nblocks) \ __ext4_journal_start_sb(NULL, (sb), __LINE__, (type), (nblocks), 0,\ ext4_trans_default_revoke_credits(sb)) #define ext4_journal_start(inode, type, nblocks) \ __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0, \ ext4_trans_default_revoke_credits((inode)->i_sb)) #define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks)\ __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks),\ ext4_trans_default_revoke_credits((inode)->i_sb)) #define ext4_journal_start_with_revoke(inode, type, blocks, revoke_creds) \ __ext4_journal_start((inode), __LINE__, (type), (blocks), 0, \ (revoke_creds)) static inline handle_t *__ext4_journal_start(struct inode *inode, unsigned int line, int type, int blocks, int rsv_blocks, int revoke_creds) { return __ext4_journal_start_sb(inode, inode->i_sb, line, type, blocks, rsv_blocks, revoke_creds); } #define ext4_journal_stop(handle) \ __ext4_journal_stop(__func__, __LINE__, (handle)) #define ext4_journal_start_reserved(handle, type) \ __ext4_journal_start_reserved((handle), __LINE__, (type)) handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line, int type); static inline handle_t *ext4_journal_current_handle(void) { return 
journal_current_handle(); } static inline int ext4_journal_extend(handle_t *handle, int nblocks, int revoke) { if (ext4_handle_valid(handle)) return jbd2_journal_extend(handle, nblocks, revoke); return 0; } static inline int ext4_journal_restart(handle_t *handle, int nblocks, int revoke) { if (ext4_handle_valid(handle)) return jbd2__journal_restart(handle, nblocks, revoke, GFP_NOFS); return 0; } int __ext4_journal_ensure_credits(handle_t *handle, int check_cred, int extend_cred, int revoke_cred); /* * Ensure @handle has at least @check_creds credits available. If not, * transaction will be extended or restarted to contain at least @extend_cred * credits. Before restarting transaction @fn is executed to allow for cleanup * before the transaction is restarted. * * The return value is < 0 in case of error, 0 in case the handle has enough * credits or transaction extension succeeded, 1 in case transaction had to be * restarted. */ #define ext4_journal_ensure_credits_fn(handle, check_cred, extend_cred, \ revoke_cred, fn) \ ({ \ __label__ __ensure_end; \ int err = __ext4_journal_ensure_credits((handle), (check_cred), \ (extend_cred), (revoke_cred)); \ \ if (err <= 0) \ goto __ensure_end; \ err = (fn); \ if (err < 0) \ goto __ensure_end; \ err = ext4_journal_restart((handle), (extend_cred), (revoke_cred)); \ if (err == 0) \ err = 1; \ __ensure_end: \ err; \ }) /* * Ensure given handle has at least requested amount of credits available, * possibly restarting transaction if needed. We also make sure the transaction * has space for at least ext4_trans_default_revoke_credits(sb) revoke records * as freeing one or two blocks is very common pattern and requesting this is * very cheap. 
*/ static inline int ext4_journal_ensure_credits(handle_t *handle, int credits, int revoke_creds) { return ext4_journal_ensure_credits_fn(handle, credits, credits, revoke_creds, 0); } static inline int ext4_journal_blocks_per_page(struct inode *inode) { if (EXT4_JOURNAL(inode) != NULL) return jbd2_journal_blocks_per_page(inode); return 0; } static inline int ext4_journal_force_commit(journal_t *journal) { if (journal) return jbd2_journal_force_commit(journal); return 0; } static inline int ext4_jbd2_inode_add_write(handle_t *handle, struct inode *inode, loff_t start_byte, loff_t length) { if (ext4_handle_valid(handle)) return jbd2_journal_inode_ranged_write(handle, EXT4_I(inode)->jinode, start_byte, length); return 0; } static inline int ext4_jbd2_inode_add_wait(handle_t *handle, struct inode *inode, loff_t start_byte, loff_t length) { if (ext4_handle_valid(handle)) return jbd2_journal_inode_ranged_wait(handle, EXT4_I(inode)->jinode, start_byte, length); return 0; } static inline void ext4_update_inode_fsync_trans(handle_t *handle, struct inode *inode, int datasync) { struct ext4_inode_info *ei = EXT4_I(inode); if (ext4_handle_valid(handle) && !is_handle_aborted(handle)) { ei->i_sync_tid = handle->h_transaction->t_tid; if (datasync) ei->i_datasync_tid = handle->h_transaction->t_tid; } } /* super.c */ int ext4_force_commit(struct super_block *sb); /* * Ext4 inode journal modes */ #define EXT4_INODE_JOURNAL_DATA_MODE 0x01 /* journal data mode */ #define EXT4_INODE_ORDERED_DATA_MODE 0x02 /* ordered data mode */ #define EXT4_INODE_WRITEBACK_DATA_MODE 0x04 /* writeback data mode */ int ext4_inode_journal_mode(struct inode *inode); static inline int ext4_should_journal_data(struct inode *inode) { return ext4_inode_journal_mode(inode) & EXT4_INODE_JOURNAL_DATA_MODE; } static inline int ext4_should_order_data(struct inode *inode) { return ext4_inode_journal_mode(inode) & EXT4_INODE_ORDERED_DATA_MODE; } static inline int ext4_should_writeback_data(struct inode *inode) { 
return ext4_inode_journal_mode(inode) & EXT4_INODE_WRITEBACK_DATA_MODE; } static inline int ext4_free_data_revoke_credits(struct inode *inode, int blocks) { if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) return 0; if (!ext4_should_journal_data(inode)) return 0; /* * Data blocks in one extent are contiguous, just account for partial * clusters at extent boundaries */ return blocks + 2*(EXT4_SB(inode->i_sb)->s_cluster_ratio - 1); } /* * This function controls whether or not we should try to go down the * dioread_nolock code paths, which makes it safe to avoid taking * i_rwsem for direct I/O reads. This only works for extent-based * files, and it doesn't work if data journaling is enabled, since the * dioread_nolock code uses b_private to pass information back to the * I/O completion handler, and this conflicts with the jbd's use of * b_private. */ static inline int ext4_should_dioread_nolock(struct inode *inode) { if (!test_opt(inode->i_sb, DIOREAD_NOLOCK)) return 0; if (!S_ISREG(inode->i_mode)) return 0; if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return 0; if (ext4_should_journal_data(inode)) return 0; /* temporary fix to prevent generic/422 test failures */ if (!test_opt(inode->i_sb, DELALLOC)) return 0; return 1; } #endif /* _EXT4_JBD2_H */
10330 10300 70 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 // SPDX-License-Identifier: GPL-2.0-only #include <linux/ethtool.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <net/net_namespace.h> #include <linux/if_arp.h> #include <net/rtnetlink.h> static netdev_tx_t nlmon_xmit(struct sk_buff *skb, struct net_device *dev) { dev_lstats_add(dev, skb->len); dev_kfree_skb(skb); return NETDEV_TX_OK; } struct nlmon { struct netlink_tap nt; }; static int nlmon_open(struct net_device *dev) { struct nlmon *nlmon = netdev_priv(dev); nlmon->nt.dev = dev; nlmon->nt.module = THIS_MODULE; return netlink_add_tap(&nlmon->nt); } static int nlmon_close(struct net_device *dev) { struct nlmon *nlmon = netdev_priv(dev); return netlink_remove_tap(&nlmon->nt); } static void nlmon_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { dev_lstats_read(dev, &stats->rx_packets, &stats->rx_bytes); } static u32 always_on(struct net_device *dev) { return 1; } static const struct ethtool_ops nlmon_ethtool_ops = { .get_link = always_on, }; static const struct net_device_ops nlmon_ops = { .ndo_open = nlmon_open, .ndo_stop = nlmon_close, .ndo_start_xmit = nlmon_xmit, .ndo_get_stats64 = nlmon_get_stats64, }; static void nlmon_setup(struct net_device *dev) { dev->type = ARPHRD_NETLINK; dev->priv_flags |= IFF_NO_QUEUE; dev->netdev_ops = &nlmon_ops; dev->ethtool_ops = &nlmon_ethtool_ops; dev->needs_free_netdev = true; dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA | NETIF_F_LLTX; dev->flags = IFF_NOARP; dev->pcpu_stat_type = NETDEV_PCPU_STAT_LSTATS; /* That's rather a softlimit here, which, of 
course, * can be altered. Not a real MTU, but what is to be * expected in most cases. */ dev->mtu = NLMSG_GOODSIZE; dev->min_mtu = sizeof(struct nlmsghdr); } static int nlmon_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS]) return -EINVAL; return 0; } static struct rtnl_link_ops nlmon_link_ops __read_mostly = { .kind = "nlmon", .priv_size = sizeof(struct nlmon), .setup = nlmon_setup, .validate = nlmon_validate, }; static __init int nlmon_register(void) { return rtnl_link_register(&nlmon_link_ops); } static __exit void nlmon_unregister(void) { rtnl_link_unregister(&nlmon_link_ops); } module_init(nlmon_register); module_exit(nlmon_unregister); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>"); MODULE_AUTHOR("Mathieu Geli <geli@enseirb.fr>"); MODULE_DESCRIPTION("Netlink monitoring device"); MODULE_ALIAS_RTNL_LINK("nlmon");
7 2 3 3 1 1 2 2 38 34 1 6 41 114 729 4 32 35 36 2 9 11 27 1 1 26 3 1 20 11 2 9 1 5 3 48 49 49 12 2 2 33 30 1 43 28 1 4 33 25 831 862 843 142 694 64 2 2 2 3 1 2 2 2 2 2 1 2 3 1 3 4 4 1 3 1 2 1 3 2 4 1 3 1 2 1 2 4 2 1 1 2 2 1 1 1 567 110 432 1 2 1 7 3 1 3 4 2 2 3 2 3 1 2 1 2 1 2 1 3 2 3 1 3 2 1 2 2 1 1 5 2 4 4 2 1 2 2 3 1 2 2 2 3 1 5 1 2 2 3 3 5 1 1 20 7 2 1 2 3 1 16 4 11 28 1 4 2 102 29 2 574 110 863 9 25 553 296 12 3 4 2 1 2 4 5 7 1 6 1 2 3 152 159 13 143 41 5 2 2 5 1 27 9 26 10 25 11 26 10 26 10 29 7 1 1 2 1 1 1 1 12 2 1 2 1 1 1 2 1 1 1 4 1 3 1 1 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 2 1 1 1 1 1 2 1 2 1 1 2 1 1 1 1 42 42 167 7 8 118 21 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 
368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 
868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 
1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 // SPDX-License-Identifier: GPL-2.0-or-later /* * IPv6 BSD socket options interface * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Based on linux/net/ipv4/ip_sockglue.c * * FIXME: Make the setsockopt code POSIX compliant: That is * * o Truncate getsockopt returns * o Return an optlen of the truncated length if need be * * Changes: * David L Stevens <dlstevens@us.ibm.com>: * - added multicast source filtering API for MLDv2 */ #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/mroute6.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/init.h> #include <linux/sysctl.h> #include <linux/netfilter.h> #include <linux/slab.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ipv6.h> #include <net/ndisc.h> #include 
<net/protocol.h> #include <net/transp_v6.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/inet_common.h> #include <net/tcp.h> #include <net/udp.h> #include <net/udplite.h> #include <net/xfrm.h> #include <net/compat.h> #include <net/seg6.h> #include <linux/uaccess.h> struct ip6_ra_chain *ip6_ra_chain; DEFINE_RWLOCK(ip6_ra_lock); DEFINE_STATIC_KEY_FALSE(ip6_min_hopcount); int ip6_ra_control(struct sock *sk, int sel) { struct ip6_ra_chain *ra, *new_ra, **rap; /* RA packet may be delivered ONLY to IPPROTO_RAW socket */ if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_RAW) return -ENOPROTOOPT; new_ra = (sel >= 0) ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; if (sel >= 0 && !new_ra) return -ENOMEM; write_lock_bh(&ip6_ra_lock); for (rap = &ip6_ra_chain; (ra = *rap) != NULL; rap = &ra->next) { if (ra->sk == sk) { if (sel >= 0) { write_unlock_bh(&ip6_ra_lock); kfree(new_ra); return -EADDRINUSE; } *rap = ra->next; write_unlock_bh(&ip6_ra_lock); sock_put(sk); kfree(ra); return 0; } } if (!new_ra) { write_unlock_bh(&ip6_ra_lock); return -ENOBUFS; } new_ra->sk = sk; new_ra->sel = sel; new_ra->next = ra; *rap = new_ra; sock_hold(sk); write_unlock_bh(&ip6_ra_lock); return 0; } struct ipv6_txoptions *ipv6_update_options(struct sock *sk, struct ipv6_txoptions *opt) { if (inet_test_bit(IS_ICSK, sk)) { if (opt && !((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) && inet_sk(sk)->inet_daddr != LOOPBACK4_IPV6) { struct inet_connection_sock *icsk = inet_csk(sk); icsk->icsk_ext_hdr_len = opt->opt_flen + opt->opt_nflen; icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); } } opt = xchg((__force struct ipv6_txoptions **)&inet6_sk(sk)->opt, opt); sk_dst_reset(sk); return opt; } static bool setsockopt_needs_rtnl(int optname) { switch (optname) { case IPV6_ADDRFORM: case IPV6_ADD_MEMBERSHIP: case IPV6_DROP_MEMBERSHIP: case IPV6_JOIN_ANYCAST: case IPV6_LEAVE_ANYCAST: case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: case MCAST_JOIN_SOURCE_GROUP: case 
MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: case MCAST_MSFILTER: return true; } return false; } static int copy_group_source_from_sockptr(struct group_source_req *greqs, sockptr_t optval, int optlen) { if (in_compat_syscall()) { struct compat_group_source_req gr32; if (optlen < sizeof(gr32)) return -EINVAL; if (copy_from_sockptr(&gr32, optval, sizeof(gr32))) return -EFAULT; greqs->gsr_interface = gr32.gsr_interface; greqs->gsr_group = gr32.gsr_group; greqs->gsr_source = gr32.gsr_source; } else { if (optlen < sizeof(*greqs)) return -EINVAL; if (copy_from_sockptr(greqs, optval, sizeof(*greqs))) return -EFAULT; } return 0; } static int do_ipv6_mcast_group_source(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct group_source_req greqs; int omode, add; int ret; ret = copy_group_source_from_sockptr(&greqs, optval, optlen); if (ret) return ret; if (greqs.gsr_group.ss_family != AF_INET6 || greqs.gsr_source.ss_family != AF_INET6) return -EADDRNOTAVAIL; if (optname == MCAST_BLOCK_SOURCE) { omode = MCAST_EXCLUDE; add = 1; } else if (optname == MCAST_UNBLOCK_SOURCE) { omode = MCAST_EXCLUDE; add = 0; } else if (optname == MCAST_JOIN_SOURCE_GROUP) { struct sockaddr_in6 *psin6; int retv; psin6 = (struct sockaddr_in6 *)&greqs.gsr_group; retv = ipv6_sock_mc_join_ssm(sk, greqs.gsr_interface, &psin6->sin6_addr, MCAST_INCLUDE); /* prior join w/ different source is ok */ if (retv && retv != -EADDRINUSE) return retv; omode = MCAST_INCLUDE; add = 1; } else /* MCAST_LEAVE_SOURCE_GROUP */ { omode = MCAST_INCLUDE; add = 0; } return ip6_mc_source(add, omode, sk, &greqs); } static int ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen) { struct group_filter *gsf; int ret; if (optlen < GROUP_FILTER_SIZE(0)) return -EINVAL; if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max)) return -ENOBUFS; gsf = memdup_sockptr(optval, optlen); if (IS_ERR(gsf)) return PTR_ERR(gsf); /* numsrc >= (4G-140)/128 overflow in 32 bits */ 
ret = -ENOBUFS; if (gsf->gf_numsrc >= 0x1ffffffU || gsf->gf_numsrc > sysctl_mld_max_msf) goto out_free_gsf; ret = -EINVAL; if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) goto out_free_gsf; ret = ip6_mc_msfilter(sk, gsf, gsf->gf_slist_flex); out_free_gsf: kfree(gsf); return ret; } static int compat_ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen) { const int size0 = offsetof(struct compat_group_filter, gf_slist_flex); struct compat_group_filter *gf32; void *p; int ret; int n; if (optlen < size0) return -EINVAL; if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max) - 4) return -ENOBUFS; p = kmalloc(optlen + 4, GFP_KERNEL); if (!p) return -ENOMEM; gf32 = p + 4; /* we want ->gf_group and ->gf_slist_flex aligned */ ret = -EFAULT; if (copy_from_sockptr(gf32, optval, optlen)) goto out_free_p; /* numsrc >= (4G-140)/128 overflow in 32 bits */ ret = -ENOBUFS; n = gf32->gf_numsrc; if (n >= 0x1ffffffU || n > sysctl_mld_max_msf) goto out_free_p; ret = -EINVAL; if (offsetof(struct compat_group_filter, gf_slist_flex[n]) > optlen) goto out_free_p; ret = ip6_mc_msfilter(sk, &(struct group_filter){ .gf_interface = gf32->gf_interface, .gf_group = gf32->gf_group, .gf_fmode = gf32->gf_fmode, .gf_numsrc = gf32->gf_numsrc}, gf32->gf_slist_flex); out_free_p: kfree(p); return ret; } static int ipv6_mcast_join_leave(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct sockaddr_in6 *psin6; struct group_req greq; if (optlen < sizeof(greq)) return -EINVAL; if (copy_from_sockptr(&greq, optval, sizeof(greq))) return -EFAULT; if (greq.gr_group.ss_family != AF_INET6) return -EADDRNOTAVAIL; psin6 = (struct sockaddr_in6 *)&greq.gr_group; if (optname == MCAST_JOIN_GROUP) return ipv6_sock_mc_join(sk, greq.gr_interface, &psin6->sin6_addr); return ipv6_sock_mc_drop(sk, greq.gr_interface, &psin6->sin6_addr); } static int compat_ipv6_mcast_join_leave(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct compat_group_req gr32; struct sockaddr_in6 
*psin6; if (optlen < sizeof(gr32)) return -EINVAL; if (copy_from_sockptr(&gr32, optval, sizeof(gr32))) return -EFAULT; if (gr32.gr_group.ss_family != AF_INET6) return -EADDRNOTAVAIL; psin6 = (struct sockaddr_in6 *)&gr32.gr_group; if (optname == MCAST_JOIN_GROUP) return ipv6_sock_mc_join(sk, gr32.gr_interface, &psin6->sin6_addr); return ipv6_sock_mc_drop(sk, gr32.gr_interface, &psin6->sin6_addr); } static int ipv6_set_opt_hdr(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_opt_hdr *new = NULL; struct net *net = sock_net(sk); struct ipv6_txoptions *opt; int err; /* hop-by-hop / destination options are privileged option */ if (optname != IPV6_RTHDR && !sockopt_ns_capable(net->user_ns, CAP_NET_RAW)) return -EPERM; /* remove any sticky options header with a zero option * length, per RFC3542. */ if (optlen > 0) { if (sockptr_is_null(optval)) return -EINVAL; if (optlen < sizeof(struct ipv6_opt_hdr) || optlen & 0x7 || optlen > 8 * 255) return -EINVAL; new = memdup_sockptr(optval, optlen); if (IS_ERR(new)) return PTR_ERR(new); if (unlikely(ipv6_optlen(new) > optlen)) { kfree(new); return -EINVAL; } } opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); opt = ipv6_renew_options(sk, opt, optname, new); kfree(new); if (IS_ERR(opt)) return PTR_ERR(opt); /* routing header option needs extra check */ err = -EINVAL; if (optname == IPV6_RTHDR && opt && opt->srcrt) { struct ipv6_rt_hdr *rthdr = opt->srcrt; switch (rthdr->type) { #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_SRCRT_TYPE_2: if (rthdr->hdrlen != 2 || rthdr->segments_left != 1) goto sticky_done; break; #endif case IPV6_SRCRT_TYPE_4: { struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)opt->srcrt; if (!seg6_validate_srh(srh, optlen, false)) goto sticky_done; break; } default: goto sticky_done; } } err = 0; opt = ipv6_update_options(sk, opt); sticky_done: if (opt) { atomic_sub(opt->tot_len, &sk->sk_omem_alloc); txopt_put(opt); } return err; } int 
do_ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); int val, valbool; int retv = -ENOPROTOOPT; bool needs_rtnl = setsockopt_needs_rtnl(optname); if (sockptr_is_null(optval)) val = 0; else { if (optlen >= sizeof(int)) { if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; } else val = 0; } valbool = (val != 0); if (ip6_mroute_opt(optname)) return ip6_mroute_setsockopt(sk, optname, optval, optlen); /* Handle options that can be set without locking the socket. */ switch (optname) { case IPV6_UNICAST_HOPS: if (optlen < sizeof(int)) return -EINVAL; if (val > 255 || val < -1) return -EINVAL; WRITE_ONCE(np->hop_limit, val); return 0; case IPV6_MULTICAST_LOOP: if (optlen < sizeof(int)) return -EINVAL; if (val != valbool) return -EINVAL; inet6_assign_bit(MC6_LOOP, sk, valbool); return 0; case IPV6_MULTICAST_HOPS: if (sk->sk_type == SOCK_STREAM) return retv; if (optlen < sizeof(int)) return -EINVAL; if (val > 255 || val < -1) return -EINVAL; WRITE_ONCE(np->mcast_hops, val == -1 ? IPV6_DEFAULT_MCASTHOPS : val); return 0; case IPV6_MTU: if (optlen < sizeof(int)) return -EINVAL; if (val && val < IPV6_MIN_MTU) return -EINVAL; WRITE_ONCE(np->frag_size, val); return 0; case IPV6_MINHOPCOUNT: if (optlen < sizeof(int)) return -EINVAL; if (val < 0 || val > 255) return -EINVAL; if (val) static_branch_enable(&ip6_min_hopcount); /* tcp_v6_err() and tcp_v6_rcv() might read min_hopcount * while we are changing it. 
*/ WRITE_ONCE(np->min_hopcount, val); return 0; case IPV6_RECVERR_RFC4884: if (optlen < sizeof(int)) return -EINVAL; if (val < 0 || val > 1) return -EINVAL; inet6_assign_bit(RECVERR6_RFC4884, sk, valbool); return 0; case IPV6_MULTICAST_ALL: if (optlen < sizeof(int)) return -EINVAL; inet6_assign_bit(MC6_ALL, sk, valbool); return 0; case IPV6_AUTOFLOWLABEL: inet6_assign_bit(AUTOFLOWLABEL, sk, valbool); inet6_set_bit(AUTOFLOWLABEL_SET, sk); return 0; case IPV6_DONTFRAG: inet6_assign_bit(DONTFRAG, sk, valbool); return 0; case IPV6_RECVERR: if (optlen < sizeof(int)) return -EINVAL; inet6_assign_bit(RECVERR6, sk, valbool); if (!val) skb_errqueue_purge(&sk->sk_error_queue); return 0; case IPV6_ROUTER_ALERT_ISOLATE: if (optlen < sizeof(int)) return -EINVAL; inet6_assign_bit(RTALERT_ISOLATE, sk, valbool); return 0; case IPV6_MTU_DISCOVER: if (optlen < sizeof(int)) return -EINVAL; if (val < IPV6_PMTUDISC_DONT || val > IPV6_PMTUDISC_OMIT) return -EINVAL; WRITE_ONCE(np->pmtudisc, val); return 0; case IPV6_FLOWINFO_SEND: if (optlen < sizeof(int)) return -EINVAL; inet6_assign_bit(SNDFLOW, sk, valbool); return 0; case IPV6_ADDR_PREFERENCES: if (optlen < sizeof(int)) return -EINVAL; return ip6_sock_set_addr_preferences(sk, val); case IPV6_MULTICAST_IF: if (sk->sk_type == SOCK_STREAM) return -ENOPROTOOPT; if (optlen < sizeof(int)) return -EINVAL; if (val) { struct net_device *dev; int bound_dev_if, midx; rcu_read_lock(); dev = dev_get_by_index_rcu(net, val); if (!dev) { rcu_read_unlock(); return -ENODEV; } midx = l3mdev_master_ifindex_rcu(dev); rcu_read_unlock(); bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); if (bound_dev_if && bound_dev_if != val && (!midx || midx != bound_dev_if)) return -EINVAL; } WRITE_ONCE(np->mcast_oif, val); return 0; case IPV6_UNICAST_IF: { struct net_device *dev; int ifindex; if (optlen != sizeof(int)) return -EINVAL; ifindex = (__force int)ntohl((__force __be32)val); if (!ifindex) { WRITE_ONCE(np->ucast_oif, 0); return 0; } dev = dev_get_by_index(net, 
ifindex); if (!dev) return -EADDRNOTAVAIL; dev_put(dev); if (READ_ONCE(sk->sk_bound_dev_if)) return -EINVAL; WRITE_ONCE(np->ucast_oif, ifindex); return 0; } } if (needs_rtnl) rtnl_lock(); sockopt_lock_sock(sk); /* Another thread has converted the socket into IPv4 with * IPV6_ADDRFORM concurrently. */ if (unlikely(sk->sk_family != AF_INET6)) goto unlock; switch (optname) { case IPV6_ADDRFORM: if (optlen < sizeof(int)) goto e_inval; if (val == PF_INET) { if (sk->sk_type == SOCK_RAW) break; if (sk->sk_protocol == IPPROTO_UDP || sk->sk_protocol == IPPROTO_UDPLITE) { struct udp_sock *up = udp_sk(sk); if (up->pending == AF_INET6) { retv = -EBUSY; break; } } else if (sk->sk_protocol == IPPROTO_TCP) { if (sk->sk_prot != &tcpv6_prot) { retv = -EBUSY; break; } } else { break; } if (sk->sk_state != TCP_ESTABLISHED) { retv = -ENOTCONN; break; } if (ipv6_only_sock(sk) || !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) { retv = -EADDRNOTAVAIL; break; } __ipv6_sock_mc_close(sk); __ipv6_sock_ac_close(sk); if (sk->sk_protocol == IPPROTO_TCP) { struct inet_connection_sock *icsk = inet_csk(sk); sock_prot_inuse_add(net, sk->sk_prot, -1); sock_prot_inuse_add(net, &tcp_prot, 1); /* Paired with READ_ONCE(sk->sk_prot) in inet6_stream_ops */ WRITE_ONCE(sk->sk_prot, &tcp_prot); /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */ WRITE_ONCE(icsk->icsk_af_ops, &ipv4_specific); WRITE_ONCE(sk->sk_socket->ops, &inet_stream_ops); WRITE_ONCE(sk->sk_family, PF_INET); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); } else { struct proto *prot = &udp_prot; if (sk->sk_protocol == IPPROTO_UDPLITE) prot = &udplite_prot; sock_prot_inuse_add(net, sk->sk_prot, -1); sock_prot_inuse_add(net, prot, 1); /* Paired with READ_ONCE(sk->sk_prot) in inet6_dgram_ops */ WRITE_ONCE(sk->sk_prot, prot); WRITE_ONCE(sk->sk_socket->ops, &inet_dgram_ops); WRITE_ONCE(sk->sk_family, PF_INET); } /* Disable all options not to allocate memory anymore, * but there is still a race. 
See the lockless path * in udpv6_sendmsg() and ipv6_local_rxpmtu(). */ np->rxopt.all = 0; inet6_cleanup_sock(sk); module_put(THIS_MODULE); retv = 0; break; } goto e_inval; case IPV6_V6ONLY: if (optlen < sizeof(int) || inet_sk(sk)->inet_num) goto e_inval; sk->sk_ipv6only = valbool; retv = 0; break; case IPV6_RECVPKTINFO: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxinfo = valbool; retv = 0; break; case IPV6_2292PKTINFO: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxoinfo = valbool; retv = 0; break; case IPV6_RECVHOPLIMIT: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxhlim = valbool; retv = 0; break; case IPV6_2292HOPLIMIT: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxohlim = valbool; retv = 0; break; case IPV6_RECVRTHDR: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.srcrt = valbool; retv = 0; break; case IPV6_2292RTHDR: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.osrcrt = valbool; retv = 0; break; case IPV6_RECVHOPOPTS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.hopopts = valbool; retv = 0; break; case IPV6_2292HOPOPTS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.ohopopts = valbool; retv = 0; break; case IPV6_RECVDSTOPTS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.dstopts = valbool; retv = 0; break; case IPV6_2292DSTOPTS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.odstopts = valbool; retv = 0; break; case IPV6_TCLASS: if (optlen < sizeof(int)) goto e_inval; if (val < -1 || val > 0xff) goto e_inval; /* RFC 3542, 6.5: default traffic class of 0x0 */ if (val == -1) val = 0; if (sk->sk_type == SOCK_STREAM) { val &= ~INET_ECN_MASK; val |= np->tclass & INET_ECN_MASK; } if (np->tclass != val) { np->tclass = val; sk_dst_reset(sk); } retv = 0; break; case IPV6_RECVTCLASS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxtclass = valbool; retv = 0; break; case IPV6_FLOWINFO: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxflow = valbool; retv = 0; 
break; case IPV6_RECVPATHMTU: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxpmtu = valbool; retv = 0; break; case IPV6_TRANSPARENT: if (valbool && !sockopt_ns_capable(net->user_ns, CAP_NET_RAW) && !sockopt_ns_capable(net->user_ns, CAP_NET_ADMIN)) { retv = -EPERM; break; } if (optlen < sizeof(int)) goto e_inval; /* we don't have a separate transparent bit for IPV6 we use the one in the IPv4 socket */ inet_assign_bit(TRANSPARENT, sk, valbool); retv = 0; break; case IPV6_FREEBIND: if (optlen < sizeof(int)) goto e_inval; /* we also don't have a separate freebind bit for IPV6 */ inet_assign_bit(FREEBIND, sk, valbool); retv = 0; break; case IPV6_RECVORIGDSTADDR: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxorigdstaddr = valbool; retv = 0; break; case IPV6_HOPOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: retv = ipv6_set_opt_hdr(sk, optname, optval, optlen); break; case IPV6_PKTINFO: { struct in6_pktinfo pkt; if (optlen == 0) goto e_inval; else if (optlen < sizeof(struct in6_pktinfo) || sockptr_is_null(optval)) goto e_inval; if (copy_from_sockptr(&pkt, optval, sizeof(pkt))) { retv = -EFAULT; break; } if (!sk_dev_equal_l3scope(sk, pkt.ipi6_ifindex)) goto e_inval; np->sticky_pktinfo.ipi6_ifindex = pkt.ipi6_ifindex; np->sticky_pktinfo.ipi6_addr = pkt.ipi6_addr; retv = 0; break; } case IPV6_2292PKTOPTIONS: { struct ipv6_txoptions *opt = NULL; struct msghdr msg; struct flowi6 fl6; struct ipcm6_cookie ipc6; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_oif = sk->sk_bound_dev_if; fl6.flowi6_mark = sk->sk_mark; if (optlen == 0) goto update; /* 1K is probably excessive * 1K is surely not enough, 2K per standard header is 16K. 
*/ retv = -EINVAL; if (optlen > 64*1024) break; opt = sock_kmalloc(sk, sizeof(*opt) + optlen, GFP_KERNEL); retv = -ENOBUFS; if (!opt) break; memset(opt, 0, sizeof(*opt)); refcount_set(&opt->refcnt, 1); opt->tot_len = sizeof(*opt) + optlen; retv = -EFAULT; if (copy_from_sockptr(opt + 1, optval, optlen)) goto done; msg.msg_controllen = optlen; msg.msg_control_is_user = false; msg.msg_control = (void *)(opt+1); ipc6.opt = opt; retv = ip6_datagram_send_ctl(net, sk, &msg, &fl6, &ipc6); if (retv) goto done; update: retv = 0; opt = ipv6_update_options(sk, opt); done: if (opt) { atomic_sub(opt->tot_len, &sk->sk_omem_alloc); txopt_put(opt); } break; } case IPV6_ADD_MEMBERSHIP: case IPV6_DROP_MEMBERSHIP: { struct ipv6_mreq mreq; if (optlen < sizeof(struct ipv6_mreq)) goto e_inval; retv = -EPROTO; if (inet_test_bit(IS_ICSK, sk)) break; retv = -EFAULT; if (copy_from_sockptr(&mreq, optval, sizeof(struct ipv6_mreq))) break; if (optname == IPV6_ADD_MEMBERSHIP) retv = ipv6_sock_mc_join(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_multiaddr); else retv = ipv6_sock_mc_drop(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_multiaddr); break; } case IPV6_JOIN_ANYCAST: case IPV6_LEAVE_ANYCAST: { struct ipv6_mreq mreq; if (optlen < sizeof(struct ipv6_mreq)) goto e_inval; retv = -EFAULT; if (copy_from_sockptr(&mreq, optval, sizeof(struct ipv6_mreq))) break; if (optname == IPV6_JOIN_ANYCAST) retv = ipv6_sock_ac_join(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_acaddr); else retv = ipv6_sock_ac_drop(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_acaddr); break; } case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: if (in_compat_syscall()) retv = compat_ipv6_mcast_join_leave(sk, optname, optval, optlen); else retv = ipv6_mcast_join_leave(sk, optname, optval, optlen); break; case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: retv = do_ipv6_mcast_group_source(sk, optname, optval, optlen); break; case MCAST_MSFILTER: if (in_compat_syscall()) retv = 
compat_ipv6_set_mcast_msfilter(sk, optval, optlen); else retv = ipv6_set_mcast_msfilter(sk, optval, optlen); break; case IPV6_ROUTER_ALERT: if (optlen < sizeof(int)) goto e_inval; retv = ip6_ra_control(sk, val); if (retv == 0) inet6_assign_bit(RTALERT, sk, valbool); break; case IPV6_FLOWLABEL_MGR: retv = ipv6_flowlabel_opt(sk, optval, optlen); break; case IPV6_IPSEC_POLICY: case IPV6_XFRM_POLICY: retv = -EPERM; if (!sockopt_ns_capable(net->user_ns, CAP_NET_ADMIN)) break; retv = xfrm_user_policy(sk, optname, optval, optlen); break; case IPV6_RECVFRAGSIZE: np->rxopt.bits.recvfragsize = valbool; retv = 0; break; } unlock: sockopt_release_sock(sk); if (needs_rtnl) rtnl_unlock(); return retv; e_inval: retv = -EINVAL; goto unlock; } int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { int err; if (level == SOL_IP && sk->sk_type != SOCK_RAW) return udp_prot.setsockopt(sk, level, optname, optval, optlen); if (level != SOL_IPV6) return -ENOPROTOOPT; err = do_ipv6_setsockopt(sk, level, optname, optval, optlen); #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IPV6_IPSEC_POLICY && optname != IPV6_XFRM_POLICY) err = nf_setsockopt(sk, PF_INET6, optname, optval, optlen); #endif return err; } EXPORT_SYMBOL(ipv6_setsockopt); static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_txoptions *opt, int optname, sockptr_t optval, int len) { struct ipv6_opt_hdr *hdr; if (!opt) return 0; switch (optname) { case IPV6_HOPOPTS: hdr = opt->hopopt; break; case IPV6_RTHDRDSTOPTS: hdr = opt->dst0opt; break; case IPV6_RTHDR: hdr = (struct ipv6_opt_hdr *)opt->srcrt; break; case IPV6_DSTOPTS: hdr = opt->dst1opt; break; default: return -EINVAL; /* should not happen */ } if (!hdr) return 0; len = min_t(unsigned int, len, ipv6_optlen(hdr)); if (copy_to_sockptr(optval, hdr, len)) return -EFAULT; return len; } static int ipv6_get_msfilter(struct sock *sk, 
sockptr_t optval, sockptr_t optlen, int len) { const int size0 = offsetof(struct group_filter, gf_slist_flex); struct group_filter gsf; int num; int err; if (len < size0) return -EINVAL; if (copy_from_sockptr(&gsf, optval, size0)) return -EFAULT; if (gsf.gf_group.ss_family != AF_INET6) return -EADDRNOTAVAIL; num = gsf.gf_numsrc; sockopt_lock_sock(sk); err = ip6_mc_msfget(sk, &gsf, optval, size0); if (!err) { if (num > gsf.gf_numsrc) num = gsf.gf_numsrc; len = GROUP_FILTER_SIZE(num); if (copy_to_sockptr(optlen, &len, sizeof(int)) || copy_to_sockptr(optval, &gsf, size0)) err = -EFAULT; } sockopt_release_sock(sk); return err; } static int compat_ipv6_get_msfilter(struct sock *sk, sockptr_t optval, sockptr_t optlen, int len) { const int size0 = offsetof(struct compat_group_filter, gf_slist_flex); struct compat_group_filter gf32; struct group_filter gf; int err; int num; if (len < size0) return -EINVAL; if (copy_from_sockptr(&gf32, optval, size0)) return -EFAULT; gf.gf_interface = gf32.gf_interface; gf.gf_fmode = gf32.gf_fmode; num = gf.gf_numsrc = gf32.gf_numsrc; gf.gf_group = gf32.gf_group; if (gf.gf_group.ss_family != AF_INET6) return -EADDRNOTAVAIL; sockopt_lock_sock(sk); err = ip6_mc_msfget(sk, &gf, optval, size0); sockopt_release_sock(sk); if (err) return err; if (num > gf.gf_numsrc) num = gf.gf_numsrc; len = GROUP_FILTER_SIZE(num) - (sizeof(gf)-sizeof(gf32)); if (copy_to_sockptr(optlen, &len, sizeof(int)) || copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_fmode), &gf.gf_fmode, sizeof(gf32.gf_fmode)) || copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_numsrc), &gf.gf_numsrc, sizeof(gf32.gf_numsrc))) return -EFAULT; return 0; } int do_ipv6_getsockopt(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen) { struct ipv6_pinfo *np = inet6_sk(sk); int len; int val; if (ip6_mroute_opt(optname)) return ip6_mroute_getsockopt(sk, optname, optval, optlen); if (copy_from_sockptr(&len, optlen, sizeof(int))) 
return -EFAULT; switch (optname) { case IPV6_ADDRFORM: if (sk->sk_protocol != IPPROTO_UDP && sk->sk_protocol != IPPROTO_UDPLITE && sk->sk_protocol != IPPROTO_TCP) return -ENOPROTOOPT; if (sk->sk_state != TCP_ESTABLISHED) return -ENOTCONN; val = sk->sk_family; break; case MCAST_MSFILTER: if (in_compat_syscall()) return compat_ipv6_get_msfilter(sk, optval, optlen, len); return ipv6_get_msfilter(sk, optval, optlen, len); case IPV6_2292PKTOPTIONS: { struct msghdr msg; struct sk_buff *skb; if (sk->sk_type != SOCK_STREAM) return -ENOPROTOOPT; if (optval.is_kernel) { msg.msg_control_is_user = false; msg.msg_control = optval.kernel; } else { msg.msg_control_is_user = true; msg.msg_control_user = optval.user; } msg.msg_controllen = len; msg.msg_flags = 0; sockopt_lock_sock(sk); skb = np->pktoptions; if (skb) ip6_datagram_recv_ctl(sk, &msg, skb); sockopt_release_sock(sk); if (!skb) { if (np->rxopt.bits.rxinfo) { int mcast_oif = READ_ONCE(np->mcast_oif); struct in6_pktinfo src_info; src_info.ipi6_ifindex = mcast_oif ? : np->sticky_pktinfo.ipi6_ifindex; src_info.ipi6_addr = mcast_oif ? sk->sk_v6_daddr : np->sticky_pktinfo.ipi6_addr; put_cmsg(&msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info); } if (np->rxopt.bits.rxhlim) { int hlim = READ_ONCE(np->mcast_hops); put_cmsg(&msg, SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim); } if (np->rxopt.bits.rxtclass) { int tclass = (int)ip6_tclass(np->rcv_flowinfo); put_cmsg(&msg, SOL_IPV6, IPV6_TCLASS, sizeof(tclass), &tclass); } if (np->rxopt.bits.rxoinfo) { int mcast_oif = READ_ONCE(np->mcast_oif); struct in6_pktinfo src_info; src_info.ipi6_ifindex = mcast_oif ? : np->sticky_pktinfo.ipi6_ifindex; src_info.ipi6_addr = mcast_oif ? 
sk->sk_v6_daddr : np->sticky_pktinfo.ipi6_addr; put_cmsg(&msg, SOL_IPV6, IPV6_2292PKTINFO, sizeof(src_info), &src_info); } if (np->rxopt.bits.rxohlim) { int hlim = READ_ONCE(np->mcast_hops); put_cmsg(&msg, SOL_IPV6, IPV6_2292HOPLIMIT, sizeof(hlim), &hlim); } if (np->rxopt.bits.rxflow) { __be32 flowinfo = np->rcv_flowinfo; put_cmsg(&msg, SOL_IPV6, IPV6_FLOWINFO, sizeof(flowinfo), &flowinfo); } } len -= msg.msg_controllen; return copy_to_sockptr(optlen, &len, sizeof(int)); } case IPV6_MTU: { struct dst_entry *dst; val = 0; rcu_read_lock(); dst = __sk_dst_get(sk); if (dst) val = dst_mtu(dst); rcu_read_unlock(); if (!val) return -ENOTCONN; break; } case IPV6_V6ONLY: val = sk->sk_ipv6only; break; case IPV6_RECVPKTINFO: val = np->rxopt.bits.rxinfo; break; case IPV6_2292PKTINFO: val = np->rxopt.bits.rxoinfo; break; case IPV6_RECVHOPLIMIT: val = np->rxopt.bits.rxhlim; break; case IPV6_2292HOPLIMIT: val = np->rxopt.bits.rxohlim; break; case IPV6_RECVRTHDR: val = np->rxopt.bits.srcrt; break; case IPV6_2292RTHDR: val = np->rxopt.bits.osrcrt; break; case IPV6_HOPOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: { struct ipv6_txoptions *opt; sockopt_lock_sock(sk); opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); len = ipv6_getsockopt_sticky(sk, opt, optname, optval, len); sockopt_release_sock(sk); /* check if ipv6_getsockopt_sticky() returns err code */ if (len < 0) return len; return copy_to_sockptr(optlen, &len, sizeof(int)); } case IPV6_RECVHOPOPTS: val = np->rxopt.bits.hopopts; break; case IPV6_2292HOPOPTS: val = np->rxopt.bits.ohopopts; break; case IPV6_RECVDSTOPTS: val = np->rxopt.bits.dstopts; break; case IPV6_2292DSTOPTS: val = np->rxopt.bits.odstopts; break; case IPV6_TCLASS: val = np->tclass; break; case IPV6_RECVTCLASS: val = np->rxopt.bits.rxtclass; break; case IPV6_FLOWINFO: val = np->rxopt.bits.rxflow; break; case IPV6_RECVPATHMTU: val = np->rxopt.bits.rxpmtu; break; case IPV6_PATHMTU: { struct dst_entry *dst; struct ip6_mtuinfo 
mtuinfo; if (len < sizeof(mtuinfo)) return -EINVAL; len = sizeof(mtuinfo); memset(&mtuinfo, 0, sizeof(mtuinfo)); rcu_read_lock(); dst = __sk_dst_get(sk); if (dst) mtuinfo.ip6m_mtu = dst_mtu(dst); rcu_read_unlock(); if (!mtuinfo.ip6m_mtu) return -ENOTCONN; if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &mtuinfo, len)) return -EFAULT; return 0; } case IPV6_TRANSPARENT: val = inet_test_bit(TRANSPARENT, sk); break; case IPV6_FREEBIND: val = inet_test_bit(FREEBIND, sk); break; case IPV6_RECVORIGDSTADDR: val = np->rxopt.bits.rxorigdstaddr; break; case IPV6_UNICAST_HOPS: case IPV6_MULTICAST_HOPS: { struct dst_entry *dst; if (optname == IPV6_UNICAST_HOPS) val = READ_ONCE(np->hop_limit); else val = READ_ONCE(np->mcast_hops); if (val < 0) { rcu_read_lock(); dst = __sk_dst_get(sk); if (dst) val = ip6_dst_hoplimit(dst); rcu_read_unlock(); } if (val < 0) val = READ_ONCE(sock_net(sk)->ipv6.devconf_all->hop_limit); break; } case IPV6_MULTICAST_LOOP: val = inet6_test_bit(MC6_LOOP, sk); break; case IPV6_MULTICAST_IF: val = READ_ONCE(np->mcast_oif); break; case IPV6_MULTICAST_ALL: val = inet6_test_bit(MC6_ALL, sk); break; case IPV6_UNICAST_IF: val = (__force int)htonl((__u32) READ_ONCE(np->ucast_oif)); break; case IPV6_MTU_DISCOVER: val = READ_ONCE(np->pmtudisc); break; case IPV6_RECVERR: val = inet6_test_bit(RECVERR6, sk); break; case IPV6_FLOWINFO_SEND: val = inet6_test_bit(SNDFLOW, sk); break; case IPV6_FLOWLABEL_MGR: { struct in6_flowlabel_req freq; int flags; if (len < sizeof(freq)) return -EINVAL; if (copy_from_sockptr(&freq, optval, sizeof(freq))) return -EFAULT; if (freq.flr_action != IPV6_FL_A_GET) return -EINVAL; len = sizeof(freq); flags = freq.flr_flags; memset(&freq, 0, sizeof(freq)); val = ipv6_flowlabel_opt_get(sk, &freq, flags); if (val < 0) return val; if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &freq, len)) return -EFAULT; return 0; } case IPV6_ADDR_PREFERENCES: { u8 
srcprefs = READ_ONCE(np->srcprefs); val = 0; if (srcprefs & IPV6_PREFER_SRC_TMP) val |= IPV6_PREFER_SRC_TMP; else if (srcprefs & IPV6_PREFER_SRC_PUBLIC) val |= IPV6_PREFER_SRC_PUBLIC; else { /* XXX: should we return system default? */ val |= IPV6_PREFER_SRC_PUBTMP_DEFAULT; } if (srcprefs & IPV6_PREFER_SRC_COA) val |= IPV6_PREFER_SRC_COA; else val |= IPV6_PREFER_SRC_HOME; break; } case IPV6_MINHOPCOUNT: val = READ_ONCE(np->min_hopcount); break; case IPV6_DONTFRAG: val = inet6_test_bit(DONTFRAG, sk); break; case IPV6_AUTOFLOWLABEL: val = ip6_autoflowlabel(sock_net(sk), sk); break; case IPV6_RECVFRAGSIZE: val = np->rxopt.bits.recvfragsize; break; case IPV6_ROUTER_ALERT: val = inet6_test_bit(RTALERT, sk); break; case IPV6_ROUTER_ALERT_ISOLATE: val = inet6_test_bit(RTALERT_ISOLATE, sk); break; case IPV6_RECVERR_RFC4884: val = inet6_test_bit(RECVERR6_RFC4884, sk); break; default: return -ENOPROTOOPT; } len = min_t(unsigned int, sizeof(int), len); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &val, len)) return -EFAULT; return 0; } int ipv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { int err; if (level == SOL_IP && sk->sk_type != SOCK_RAW) return udp_prot.getsockopt(sk, level, optname, optval, optlen); if (level != SOL_IPV6) return -ENOPROTOOPT; err = do_ipv6_getsockopt(sk, level, optname, USER_SOCKPTR(optval), USER_SOCKPTR(optlen)); #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IPV6_2292PKTOPTIONS) { int len; if (get_user(len, optlen)) return -EFAULT; err = nf_getsockopt(sk, PF_INET6, optname, optval, &len); if (err >= 0) err = put_user(len, optlen); } #endif return err; } EXPORT_SYMBOL(ipv6_getsockopt);
1094 1062 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 
524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 // SPDX-License-Identifier: GPL-2.0-or-later /* * LAPB release 002 * * This code REQUIRES 2.1.15 or higher/ NET3.038 * * History * LAPB 001 Jonathan Naylor Started Coding * LAPB 002 Jonathan Naylor New timer architecture. * 2000-10-29 Henner Eisen lapb_data_indication() return status. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/inet.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <net/sock.h> #include <linux/uaccess.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/stat.h> #include <linux/init.h> #include <net/lapb.h> static LIST_HEAD(lapb_list); static DEFINE_RWLOCK(lapb_list_lock); /* * Free an allocated lapb control block. */ static void lapb_free_cb(struct lapb_cb *lapb) { kfree(lapb); } static __inline__ void lapb_hold(struct lapb_cb *lapb) { refcount_inc(&lapb->refcnt); } static __inline__ void lapb_put(struct lapb_cb *lapb) { if (refcount_dec_and_test(&lapb->refcnt)) lapb_free_cb(lapb); } /* * Socket removal during an interrupt is now safe. */ static void __lapb_remove_cb(struct lapb_cb *lapb) { if (lapb->node.next) { list_del(&lapb->node); lapb_put(lapb); } } /* * Add a socket to the bound sockets list. 
*/ static void __lapb_insert_cb(struct lapb_cb *lapb) { list_add(&lapb->node, &lapb_list); lapb_hold(lapb); } static struct lapb_cb *__lapb_devtostruct(struct net_device *dev) { struct lapb_cb *lapb, *use = NULL; list_for_each_entry(lapb, &lapb_list, node) { if (lapb->dev == dev) { use = lapb; break; } } if (use) lapb_hold(use); return use; } static struct lapb_cb *lapb_devtostruct(struct net_device *dev) { struct lapb_cb *rc; read_lock_bh(&lapb_list_lock); rc = __lapb_devtostruct(dev); read_unlock_bh(&lapb_list_lock); return rc; } /* * Create an empty LAPB control block. */ static struct lapb_cb *lapb_create_cb(void) { struct lapb_cb *lapb = kzalloc(sizeof(*lapb), GFP_ATOMIC); if (!lapb) goto out; skb_queue_head_init(&lapb->write_queue); skb_queue_head_init(&lapb->ack_queue); timer_setup(&lapb->t1timer, NULL, 0); timer_setup(&lapb->t2timer, NULL, 0); lapb->t1timer_running = false; lapb->t2timer_running = false; lapb->t1 = LAPB_DEFAULT_T1; lapb->t2 = LAPB_DEFAULT_T2; lapb->n2 = LAPB_DEFAULT_N2; lapb->mode = LAPB_DEFAULT_MODE; lapb->window = LAPB_DEFAULT_WINDOW; lapb->state = LAPB_STATE_0; spin_lock_init(&lapb->lock); refcount_set(&lapb->refcnt, 1); out: return lapb; } int lapb_register(struct net_device *dev, const struct lapb_register_struct *callbacks) { struct lapb_cb *lapb; int rc = LAPB_BADTOKEN; write_lock_bh(&lapb_list_lock); lapb = __lapb_devtostruct(dev); if (lapb) { lapb_put(lapb); goto out; } lapb = lapb_create_cb(); rc = LAPB_NOMEM; if (!lapb) goto out; lapb->dev = dev; lapb->callbacks = callbacks; __lapb_insert_cb(lapb); lapb_start_t1timer(lapb); rc = LAPB_OK; out: write_unlock_bh(&lapb_list_lock); return rc; } EXPORT_SYMBOL(lapb_register); int lapb_unregister(struct net_device *dev) { struct lapb_cb *lapb; int rc = LAPB_BADTOKEN; write_lock_bh(&lapb_list_lock); lapb = __lapb_devtostruct(dev); if (!lapb) goto out; lapb_put(lapb); /* Wait for other refs to "lapb" to drop */ while (refcount_read(&lapb->refcnt) > 2) usleep_range(1, 10); 
spin_lock_bh(&lapb->lock); lapb_stop_t1timer(lapb); lapb_stop_t2timer(lapb); lapb_clear_queues(lapb); spin_unlock_bh(&lapb->lock); /* Wait for running timers to stop */ del_timer_sync(&lapb->t1timer); del_timer_sync(&lapb->t2timer); __lapb_remove_cb(lapb); lapb_put(lapb); rc = LAPB_OK; out: write_unlock_bh(&lapb_list_lock); return rc; } EXPORT_SYMBOL(lapb_unregister); int lapb_getparms(struct net_device *dev, struct lapb_parms_struct *parms) { int rc = LAPB_BADTOKEN; struct lapb_cb *lapb = lapb_devtostruct(dev); if (!lapb) goto out; spin_lock_bh(&lapb->lock); parms->t1 = lapb->t1 / HZ; parms->t2 = lapb->t2 / HZ; parms->n2 = lapb->n2; parms->n2count = lapb->n2count; parms->state = lapb->state; parms->window = lapb->window; parms->mode = lapb->mode; if (!timer_pending(&lapb->t1timer)) parms->t1timer = 0; else parms->t1timer = (lapb->t1timer.expires - jiffies) / HZ; if (!timer_pending(&lapb->t2timer)) parms->t2timer = 0; else parms->t2timer = (lapb->t2timer.expires - jiffies) / HZ; spin_unlock_bh(&lapb->lock); lapb_put(lapb); rc = LAPB_OK; out: return rc; } EXPORT_SYMBOL(lapb_getparms); int lapb_setparms(struct net_device *dev, struct lapb_parms_struct *parms) { int rc = LAPB_BADTOKEN; struct lapb_cb *lapb = lapb_devtostruct(dev); if (!lapb) goto out; spin_lock_bh(&lapb->lock); rc = LAPB_INVALUE; if (parms->t1 < 1 || parms->t2 < 1 || parms->n2 < 1) goto out_put; if (lapb->state == LAPB_STATE_0) { if (parms->mode & LAPB_EXTENDED) { if (parms->window < 1 || parms->window > 127) goto out_put; } else { if (parms->window < 1 || parms->window > 7) goto out_put; } lapb->mode = parms->mode; lapb->window = parms->window; } lapb->t1 = parms->t1 * HZ; lapb->t2 = parms->t2 * HZ; lapb->n2 = parms->n2; rc = LAPB_OK; out_put: spin_unlock_bh(&lapb->lock); lapb_put(lapb); out: return rc; } EXPORT_SYMBOL(lapb_setparms); int lapb_connect_request(struct net_device *dev) { struct lapb_cb *lapb = lapb_devtostruct(dev); int rc = LAPB_BADTOKEN; if (!lapb) goto out; spin_lock_bh(&lapb->lock); 
rc = LAPB_OK; if (lapb->state == LAPB_STATE_1) goto out_put; rc = LAPB_CONNECTED; if (lapb->state == LAPB_STATE_3 || lapb->state == LAPB_STATE_4) goto out_put; lapb_establish_data_link(lapb); lapb_dbg(0, "(%p) S0 -> S1\n", lapb->dev); lapb->state = LAPB_STATE_1; rc = LAPB_OK; out_put: spin_unlock_bh(&lapb->lock); lapb_put(lapb); out: return rc; } EXPORT_SYMBOL(lapb_connect_request); static int __lapb_disconnect_request(struct lapb_cb *lapb) { switch (lapb->state) { case LAPB_STATE_0: return LAPB_NOTCONNECTED; case LAPB_STATE_1: lapb_dbg(1, "(%p) S1 TX DISC(1)\n", lapb->dev); lapb_dbg(0, "(%p) S1 -> S0\n", lapb->dev); lapb_send_control(lapb, LAPB_DISC, LAPB_POLLON, LAPB_COMMAND); lapb->state = LAPB_STATE_0; lapb_start_t1timer(lapb); return LAPB_NOTCONNECTED; case LAPB_STATE_2: return LAPB_OK; } lapb_clear_queues(lapb); lapb->n2count = 0; lapb_send_control(lapb, LAPB_DISC, LAPB_POLLON, LAPB_COMMAND); lapb_start_t1timer(lapb); lapb_stop_t2timer(lapb); lapb->state = LAPB_STATE_2; lapb_dbg(1, "(%p) S3 DISC(1)\n", lapb->dev); lapb_dbg(0, "(%p) S3 -> S2\n", lapb->dev); return LAPB_OK; } int lapb_disconnect_request(struct net_device *dev) { struct lapb_cb *lapb = lapb_devtostruct(dev); int rc = LAPB_BADTOKEN; if (!lapb) goto out; spin_lock_bh(&lapb->lock); rc = __lapb_disconnect_request(lapb); spin_unlock_bh(&lapb->lock); lapb_put(lapb); out: return rc; } EXPORT_SYMBOL(lapb_disconnect_request); int lapb_data_request(struct net_device *dev, struct sk_buff *skb) { struct lapb_cb *lapb = lapb_devtostruct(dev); int rc = LAPB_BADTOKEN; if (!lapb) goto out; spin_lock_bh(&lapb->lock); rc = LAPB_NOTCONNECTED; if (lapb->state != LAPB_STATE_3 && lapb->state != LAPB_STATE_4) goto out_put; skb_queue_tail(&lapb->write_queue, skb); lapb_kick(lapb); rc = LAPB_OK; out_put: spin_unlock_bh(&lapb->lock); lapb_put(lapb); out: return rc; } EXPORT_SYMBOL(lapb_data_request); int lapb_data_received(struct net_device *dev, struct sk_buff *skb) { struct lapb_cb *lapb = lapb_devtostruct(dev); int rc 
= LAPB_BADTOKEN; if (lapb) { spin_lock_bh(&lapb->lock); lapb_data_input(lapb, skb); spin_unlock_bh(&lapb->lock); lapb_put(lapb); rc = LAPB_OK; } return rc; } EXPORT_SYMBOL(lapb_data_received); void lapb_connect_confirmation(struct lapb_cb *lapb, int reason) { if (lapb->callbacks->connect_confirmation) lapb->callbacks->connect_confirmation(lapb->dev, reason); } void lapb_connect_indication(struct lapb_cb *lapb, int reason) { if (lapb->callbacks->connect_indication) lapb->callbacks->connect_indication(lapb->dev, reason); } void lapb_disconnect_confirmation(struct lapb_cb *lapb, int reason) { if (lapb->callbacks->disconnect_confirmation) lapb->callbacks->disconnect_confirmation(lapb->dev, reason); } void lapb_disconnect_indication(struct lapb_cb *lapb, int reason) { if (lapb->callbacks->disconnect_indication) lapb->callbacks->disconnect_indication(lapb->dev, reason); } int lapb_data_indication(struct lapb_cb *lapb, struct sk_buff *skb) { if (lapb->callbacks->data_indication) return lapb->callbacks->data_indication(lapb->dev, skb); kfree_skb(skb); return NET_RX_SUCCESS; /* For now; must be != NET_RX_DROP */ } int lapb_data_transmit(struct lapb_cb *lapb, struct sk_buff *skb) { int used = 0; if (lapb->callbacks->data_transmit) { lapb->callbacks->data_transmit(lapb->dev, skb); used = 1; } return used; } /* Handle device status changes. 
*/ static int lapb_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct lapb_cb *lapb; if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; if (dev->type != ARPHRD_X25) return NOTIFY_DONE; lapb = lapb_devtostruct(dev); if (!lapb) return NOTIFY_DONE; spin_lock_bh(&lapb->lock); switch (event) { case NETDEV_UP: lapb_dbg(0, "(%p) Interface up: %s\n", dev, dev->name); if (netif_carrier_ok(dev)) { lapb_dbg(0, "(%p): Carrier is already up: %s\n", dev, dev->name); if (lapb->mode & LAPB_DCE) { lapb_start_t1timer(lapb); } else { if (lapb->state == LAPB_STATE_0) { lapb->state = LAPB_STATE_1; lapb_establish_data_link(lapb); } } } break; case NETDEV_GOING_DOWN: if (netif_carrier_ok(dev)) __lapb_disconnect_request(lapb); break; case NETDEV_DOWN: lapb_dbg(0, "(%p) Interface down: %s\n", dev, dev->name); lapb_dbg(0, "(%p) S%d -> S0\n", dev, lapb->state); lapb_clear_queues(lapb); lapb->state = LAPB_STATE_0; lapb->n2count = 0; lapb_stop_t1timer(lapb); lapb_stop_t2timer(lapb); break; case NETDEV_CHANGE: if (netif_carrier_ok(dev)) { lapb_dbg(0, "(%p): Carrier detected: %s\n", dev, dev->name); if (lapb->mode & LAPB_DCE) { lapb_start_t1timer(lapb); } else { if (lapb->state == LAPB_STATE_0) { lapb->state = LAPB_STATE_1; lapb_establish_data_link(lapb); } } } else { lapb_dbg(0, "(%p) Carrier lost: %s\n", dev, dev->name); lapb_dbg(0, "(%p) S%d -> S0\n", dev, lapb->state); lapb_clear_queues(lapb); lapb->state = LAPB_STATE_0; lapb->n2count = 0; lapb_stop_t1timer(lapb); lapb_stop_t2timer(lapb); } break; } spin_unlock_bh(&lapb->lock); lapb_put(lapb); return NOTIFY_DONE; } static struct notifier_block lapb_dev_notifier = { .notifier_call = lapb_device_event, }; static int __init lapb_init(void) { return register_netdevice_notifier(&lapb_dev_notifier); } static void __exit lapb_exit(void) { WARN_ON(!list_empty(&lapb_list)); unregister_netdevice_notifier(&lapb_dev_notifier); } MODULE_AUTHOR("Jonathan 
Naylor <g4klx@g4klx.demon.co.uk>"); MODULE_DESCRIPTION("The X.25 Link Access Procedure B link layer protocol"); MODULE_LICENSE("GPL"); module_init(lapb_init); module_exit(lapb_exit);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 /* SPDX-License-Identifier: GPL-2.0+ */ /* * Driver for 8250/16550-type serial ports * * Based on drivers/char/serial.c, by Linus Torvalds, Theodore Ts'o. * * Copyright (C) 2001 Russell King. 
*/ #include <linux/bits.h> #include <linux/serial_8250.h> #include <linux/serial_reg.h> #include <linux/dmaengine.h> #include "../serial_mctrl_gpio.h" struct uart_8250_dma { int (*tx_dma)(struct uart_8250_port *p); int (*rx_dma)(struct uart_8250_port *p); void (*prepare_tx_dma)(struct uart_8250_port *p); void (*prepare_rx_dma)(struct uart_8250_port *p); /* Filter function */ dma_filter_fn fn; /* Parameter to the filter function */ void *rx_param; void *tx_param; struct dma_slave_config rxconf; struct dma_slave_config txconf; struct dma_chan *rxchan; struct dma_chan *txchan; /* Device address base for DMA operations */ phys_addr_t rx_dma_addr; phys_addr_t tx_dma_addr; /* DMA address of the buffer in memory */ dma_addr_t rx_addr; dma_addr_t tx_addr; dma_cookie_t rx_cookie; dma_cookie_t tx_cookie; void *rx_buf; size_t rx_size; size_t tx_size; unsigned char tx_running; unsigned char tx_err; unsigned char rx_running; }; struct old_serial_port { unsigned int uart; unsigned int baud_base; unsigned int port; unsigned int irq; upf_t flags; unsigned char io_type; unsigned char __iomem *iomem_base; unsigned short iomem_reg_shift; }; struct serial8250_config { const char *name; unsigned short fifo_size; unsigned short tx_loadsz; unsigned char fcr; unsigned char rxtrig_bytes[UART_FCR_R_TRIG_MAX_STATE]; unsigned int flags; }; #define UART_CAP_FIFO BIT(8) /* UART has FIFO */ #define UART_CAP_EFR BIT(9) /* UART has EFR */ #define UART_CAP_SLEEP BIT(10) /* UART has IER sleep */ #define UART_CAP_AFE BIT(11) /* MCR-based hw flow control */ #define UART_CAP_UUE BIT(12) /* UART needs IER bit 6 set (Xscale) */ #define UART_CAP_RTOIE BIT(13) /* UART needs IER bit 4 set (Xscale, Tegra) */ #define UART_CAP_HFIFO BIT(14) /* UART has a "hidden" FIFO */ #define UART_CAP_RPM BIT(15) /* Runtime PM is active while idle */ #define UART_CAP_IRDA BIT(16) /* UART supports IrDA line discipline */ #define UART_CAP_MINI BIT(17) /* Mini UART on BCM283X family lacks: * STOP PARITY EPAR SPAR WLEN5 WLEN6 
*/ #define UART_CAP_NOTEMT BIT(18) /* UART without interrupt on TEMT available */ #define UART_BUG_QUOT BIT(0) /* UART has buggy quot LSB */ #define UART_BUG_TXEN BIT(1) /* UART has buggy TX IIR status */ #define UART_BUG_NOMSR BIT(2) /* UART has buggy MSR status bits (Au1x00) */ #define UART_BUG_THRE BIT(3) /* UART has buggy THRE reassertion */ #define UART_BUG_TXRACE BIT(5) /* UART Tx fails to set remote DR */ #ifdef CONFIG_SERIAL_8250_SHARE_IRQ #define SERIAL8250_SHARE_IRQS 1 #else #define SERIAL8250_SHARE_IRQS 0 #endif #define SERIAL8250_PORT_FLAGS(_base, _irq, _flags) \ { \ .iobase = _base, \ .irq = _irq, \ .uartclk = 1843200, \ .iotype = UPIO_PORT, \ .flags = UPF_BOOT_AUTOCONF | (_flags), \ } #define SERIAL8250_PORT(_base, _irq) SERIAL8250_PORT_FLAGS(_base, _irq, 0) static inline int serial_in(struct uart_8250_port *up, int offset) { return up->port.serial_in(&up->port, offset); } static inline void serial_out(struct uart_8250_port *up, int offset, int value) { up->port.serial_out(&up->port, offset, value); } /** * serial_lsr_in - Read LSR register and preserve flags across reads * @up: uart 8250 port * * Read LSR register and handle saving non-preserved flags across reads. * The flags that are not preserved across reads are stored into * up->lsr_saved_flags. * * Returns LSR value or'ed with the preserved flags (if any). 
*/ static inline u16 serial_lsr_in(struct uart_8250_port *up) { u16 lsr = up->lsr_saved_flags; lsr |= serial_in(up, UART_LSR); up->lsr_saved_flags = lsr & up->lsr_save_mask; return lsr; } /* * For the 16C950 */ static void serial_icr_write(struct uart_8250_port *up, int offset, int value) { serial_out(up, UART_SCR, offset); serial_out(up, UART_ICR, value); } static unsigned int __maybe_unused serial_icr_read(struct uart_8250_port *up, int offset) { unsigned int value; serial_icr_write(up, UART_ACR, up->acr | UART_ACR_ICRRD); serial_out(up, UART_SCR, offset); value = serial_in(up, UART_ICR); serial_icr_write(up, UART_ACR, up->acr); return value; } void serial8250_clear_and_reinit_fifos(struct uart_8250_port *p); static inline u32 serial_dl_read(struct uart_8250_port *up) { return up->dl_read(up); } static inline void serial_dl_write(struct uart_8250_port *up, u32 value) { up->dl_write(up, value); } static inline bool serial8250_set_THRI(struct uart_8250_port *up) { /* Port locked to synchronize UART_IER access against the console. */ lockdep_assert_held_once(&up->port.lock); if (up->ier & UART_IER_THRI) return false; up->ier |= UART_IER_THRI; serial_out(up, UART_IER, up->ier); return true; } static inline bool serial8250_clear_THRI(struct uart_8250_port *up) { /* Port locked to synchronize UART_IER access against the console. 
*/ lockdep_assert_held_once(&up->port.lock); if (!(up->ier & UART_IER_THRI)) return false; up->ier &= ~UART_IER_THRI; serial_out(up, UART_IER, up->ier); return true; } struct uart_8250_port *serial8250_get_port(int line); void serial8250_rpm_get(struct uart_8250_port *p); void serial8250_rpm_put(struct uart_8250_port *p); void serial8250_rpm_get_tx(struct uart_8250_port *p); void serial8250_rpm_put_tx(struct uart_8250_port *p); int serial8250_em485_config(struct uart_port *port, struct ktermios *termios, struct serial_rs485 *rs485); void serial8250_em485_start_tx(struct uart_8250_port *p); void serial8250_em485_stop_tx(struct uart_8250_port *p); void serial8250_em485_destroy(struct uart_8250_port *p); extern struct serial_rs485 serial8250_em485_supported; /* MCR <-> TIOCM conversion */ static inline int serial8250_TIOCM_to_MCR(int tiocm) { int mcr = 0; if (tiocm & TIOCM_RTS) mcr |= UART_MCR_RTS; if (tiocm & TIOCM_DTR) mcr |= UART_MCR_DTR; if (tiocm & TIOCM_OUT1) mcr |= UART_MCR_OUT1; if (tiocm & TIOCM_OUT2) mcr |= UART_MCR_OUT2; if (tiocm & TIOCM_LOOP) mcr |= UART_MCR_LOOP; return mcr; } static inline int serial8250_MCR_to_TIOCM(int mcr) { int tiocm = 0; if (mcr & UART_MCR_RTS) tiocm |= TIOCM_RTS; if (mcr & UART_MCR_DTR) tiocm |= TIOCM_DTR; if (mcr & UART_MCR_OUT1) tiocm |= TIOCM_OUT1; if (mcr & UART_MCR_OUT2) tiocm |= TIOCM_OUT2; if (mcr & UART_MCR_LOOP) tiocm |= TIOCM_LOOP; return tiocm; } /* MSR <-> TIOCM conversion */ static inline int serial8250_MSR_to_TIOCM(int msr) { int tiocm = 0; if (msr & UART_MSR_DCD) tiocm |= TIOCM_CAR; if (msr & UART_MSR_RI) tiocm |= TIOCM_RNG; if (msr & UART_MSR_DSR) tiocm |= TIOCM_DSR; if (msr & UART_MSR_CTS) tiocm |= TIOCM_CTS; return tiocm; } static inline void serial8250_out_MCR(struct uart_8250_port *up, int value) { serial_out(up, UART_MCR, value); if (up->gpios) mctrl_gpio_set(up->gpios, serial8250_MCR_to_TIOCM(value)); } static inline int serial8250_in_MCR(struct uart_8250_port *up) { int mctrl; mctrl = serial_in(up, 
UART_MCR); if (up->gpios) { unsigned int mctrl_gpio = 0; mctrl_gpio = mctrl_gpio_get_outputs(up->gpios, &mctrl_gpio); mctrl |= serial8250_TIOCM_to_MCR(mctrl_gpio); } return mctrl; } bool alpha_jensen(void); void alpha_jensen_set_mctrl(struct uart_port *port, unsigned int mctrl); #ifdef CONFIG_SERIAL_8250_PNP int serial8250_pnp_init(void); void serial8250_pnp_exit(void); #else static inline int serial8250_pnp_init(void) { return 0; } static inline void serial8250_pnp_exit(void) { } #endif #ifdef CONFIG_SERIAL_8250_FINTEK int fintek_8250_probe(struct uart_8250_port *uart); #else static inline int fintek_8250_probe(struct uart_8250_port *uart) { return 0; } #endif #ifdef CONFIG_ARCH_OMAP1 #include <linux/soc/ti/omap1-soc.h> static inline int is_omap1_8250(struct uart_8250_port *pt) { int res; switch (pt->port.mapbase) { case OMAP1_UART1_BASE: case OMAP1_UART2_BASE: case OMAP1_UART3_BASE: res = 1; break; default: res = 0; break; } return res; } static inline int is_omap1510_8250(struct uart_8250_port *pt) { if (!cpu_is_omap1510()) return 0; return is_omap1_8250(pt); } #else static inline int is_omap1_8250(struct uart_8250_port *pt) { return 0; } static inline int is_omap1510_8250(struct uart_8250_port *pt) { return 0; } #endif #ifdef CONFIG_SERIAL_8250_DMA extern int serial8250_tx_dma(struct uart_8250_port *); extern int serial8250_rx_dma(struct uart_8250_port *); extern void serial8250_rx_dma_flush(struct uart_8250_port *); extern int serial8250_request_dma(struct uart_8250_port *); extern void serial8250_release_dma(struct uart_8250_port *); static inline void serial8250_do_prepare_tx_dma(struct uart_8250_port *p) { struct uart_8250_dma *dma = p->dma; if (dma->prepare_tx_dma) dma->prepare_tx_dma(p); } static inline void serial8250_do_prepare_rx_dma(struct uart_8250_port *p) { struct uart_8250_dma *dma = p->dma; if (dma->prepare_rx_dma) dma->prepare_rx_dma(p); } static inline bool serial8250_tx_dma_running(struct uart_8250_port *p) { struct uart_8250_dma *dma = 
p->dma; return dma && dma->tx_running; } #else static inline int serial8250_tx_dma(struct uart_8250_port *p) { return -1; } static inline int serial8250_rx_dma(struct uart_8250_port *p) { return -1; } static inline void serial8250_rx_dma_flush(struct uart_8250_port *p) { } static inline int serial8250_request_dma(struct uart_8250_port *p) { return -1; } static inline void serial8250_release_dma(struct uart_8250_port *p) { } static inline bool serial8250_tx_dma_running(struct uart_8250_port *p) { return false; } #endif static inline int ns16550a_goto_highspeed(struct uart_8250_port *up) { unsigned char status; status = serial_in(up, 0x04); /* EXCR2 */ #define PRESL(x) ((x) & 0x30) if (PRESL(status) == 0x10) { /* already in high speed mode */ return 0; } else { status &= ~0xB0; /* Disable LOCK, mask out PRESL[01] */ status |= 0x10; /* 1.625 divisor for baud_base --> 921600 */ serial_out(up, 0x04, status); } return 1; } static inline int serial_index(struct uart_port *port) { return port->minor - 64; }
2 2 4 1 3 4 4 4 6 6 2 4 6 6 16 1 2 1 15 14 4 10 4 6 1 9 3 1 1 1 4 1 4 4 4 3 1 1 4 4 4 18 2 2 2 12 2 1 1 1 1 4 4 1 3 1 2 1 1 1 1 1 1 2 1 2 1 5 4 2 18 1 17 22 1 2 4 2 1 1 1079 1057 38 13 1 4 4 16 16 16 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 
477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 
977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 
1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 // SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) /* isotp.c - ISO 15765-2 CAN transport protocol for protocol family CAN * * This implementation does not provide ISO-TP specific return values to the * 
userspace. * * - RX path timeout of data reception leads to -ETIMEDOUT * - RX path SN mismatch leads to -EILSEQ * - RX path data reception with wrong padding leads to -EBADMSG * - TX path flowcontrol reception timeout leads to -ECOMM * - TX path flowcontrol reception overflow leads to -EMSGSIZE * - TX path flowcontrol reception with wrong layout/padding leads to -EBADMSG * - when a transfer (tx) is on the run the next write() blocks until it's done * - use CAN_ISOTP_WAIT_TX_DONE flag to block the caller until the PDU is sent * - as we have static buffers the check whether the PDU fits into the buffer * is done at FF reception time (no support for sending 'wait frames') * * Copyright (c) 2020 Volkswagen Group Electronic Research * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of Volkswagen nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * Alternatively, provided that this notice is retained in full, this * software may be distributed under the terms of the GNU General * Public License ("GPL") version 2, in which case the provisions of the * GPL apply INSTEAD OF those given above. * * The provided data structures and external interfaces from this code * are not restricted to be used by modules with a GPL compatible license. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ #include <linux/module.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/spinlock.h> #include <linux/hrtimer.h> #include <linux/wait.h> #include <linux/uio.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/socket.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/can.h> #include <linux/can/core.h> #include <linux/can/skb.h> #include <linux/can/isotp.h> #include <linux/slab.h> #include <net/sock.h> #include <net/net_namespace.h> MODULE_DESCRIPTION("PF_CAN isotp 15765-2:2016 protocol"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Oliver Hartkopp <socketcan@hartkopp.net>"); MODULE_ALIAS("can-proto-6"); #define ISOTP_MIN_NAMELEN CAN_REQUIRED_SIZE(struct sockaddr_can, can_addr.tp) #define SINGLE_MASK(id) (((id) & CAN_EFF_FLAG) ? \ (CAN_EFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG) : \ (CAN_SFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG)) /* ISO 15765-2:2016 supports more than 4095 byte per ISO PDU as the FF_DL can * take full 32 bit values (4 Gbyte). We would need some good concept to handle * this between user space and kernel space. For now set the static buffer to * something about 8 kbyte to be able to test this new functionality. 
*/ #define DEFAULT_MAX_PDU_SIZE 8300 /* maximum PDU size before ISO 15765-2:2016 extension was 4095 */ #define MAX_12BIT_PDU_SIZE 4095 /* limit the isotp pdu size from the optional module parameter to 1MByte */ #define MAX_PDU_SIZE (1025 * 1024U) static unsigned int max_pdu_size __read_mostly = DEFAULT_MAX_PDU_SIZE; module_param(max_pdu_size, uint, 0444); MODULE_PARM_DESC(max_pdu_size, "maximum isotp pdu size (default " __stringify(DEFAULT_MAX_PDU_SIZE) ")"); /* N_PCI type values in bits 7-4 of N_PCI bytes */ #define N_PCI_SF 0x00 /* single frame */ #define N_PCI_FF 0x10 /* first frame */ #define N_PCI_CF 0x20 /* consecutive frame */ #define N_PCI_FC 0x30 /* flow control */ #define N_PCI_SZ 1 /* size of the PCI byte #1 */ #define SF_PCI_SZ4 1 /* size of SingleFrame PCI including 4 bit SF_DL */ #define SF_PCI_SZ8 2 /* size of SingleFrame PCI including 8 bit SF_DL */ #define FF_PCI_SZ12 2 /* size of FirstFrame PCI including 12 bit FF_DL */ #define FF_PCI_SZ32 6 /* size of FirstFrame PCI including 32 bit FF_DL */ #define FC_CONTENT_SZ 3 /* flow control content size in byte (FS/BS/STmin) */ #define ISOTP_CHECK_PADDING (CAN_ISOTP_CHK_PAD_LEN | CAN_ISOTP_CHK_PAD_DATA) #define ISOTP_ALL_BC_FLAGS (CAN_ISOTP_SF_BROADCAST | CAN_ISOTP_CF_BROADCAST) /* Flow Status given in FC frame */ #define ISOTP_FC_CTS 0 /* clear to send */ #define ISOTP_FC_WT 1 /* wait */ #define ISOTP_FC_OVFLW 2 /* overflow */ #define ISOTP_FC_TIMEOUT 1 /* 1 sec */ #define ISOTP_ECHO_TIMEOUT 2 /* 2 secs */ enum { ISOTP_IDLE = 0, ISOTP_WAIT_FIRST_FC, ISOTP_WAIT_FC, ISOTP_WAIT_DATA, ISOTP_SENDING, ISOTP_SHUTDOWN, }; struct tpcon { u8 *buf; unsigned int buflen; unsigned int len; unsigned int idx; u32 state; u8 bs; u8 sn; u8 ll_dl; u8 sbuf[DEFAULT_MAX_PDU_SIZE]; }; struct isotp_sock { struct sock sk; int bound; int ifindex; canid_t txid; canid_t rxid; ktime_t tx_gap; ktime_t lastrxcf_tstamp; struct hrtimer rxtimer, txtimer, txfrtimer; struct can_isotp_options opt; struct can_isotp_fc_options rxfc, txfc; 
struct can_isotp_ll_options ll; u32 frame_txtime; u32 force_tx_stmin; u32 force_rx_stmin; u32 cfecho; /* consecutive frame echo tag */ struct tpcon rx, tx; struct list_head notifier; wait_queue_head_t wait; spinlock_t rx_lock; /* protect single thread state machine */ }; static LIST_HEAD(isotp_notifier_list); static DEFINE_SPINLOCK(isotp_notifier_lock); static struct isotp_sock *isotp_busy_notifier; static inline struct isotp_sock *isotp_sk(const struct sock *sk) { return (struct isotp_sock *)sk; } static u32 isotp_bc_flags(struct isotp_sock *so) { return so->opt.flags & ISOTP_ALL_BC_FLAGS; } static bool isotp_register_rxid(struct isotp_sock *so) { /* no broadcast modes => register rx_id for FC frame reception */ return (isotp_bc_flags(so) == 0); } static enum hrtimer_restart isotp_rx_timer_handler(struct hrtimer *hrtimer) { struct isotp_sock *so = container_of(hrtimer, struct isotp_sock, rxtimer); struct sock *sk = &so->sk; if (so->rx.state == ISOTP_WAIT_DATA) { /* we did not get new data frames in time */ /* report 'connection timed out' */ sk->sk_err = ETIMEDOUT; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); /* reset rx state */ so->rx.state = ISOTP_IDLE; } return HRTIMER_NORESTART; } static int isotp_send_fc(struct sock *sk, int ae, u8 flowstatus) { struct net_device *dev; struct sk_buff *nskb; struct canfd_frame *ncf; struct isotp_sock *so = isotp_sk(sk); int can_send_ret; nskb = alloc_skb(so->ll.mtu + sizeof(struct can_skb_priv), gfp_any()); if (!nskb) return 1; dev = dev_get_by_index(sock_net(sk), so->ifindex); if (!dev) { kfree_skb(nskb); return 1; } can_skb_reserve(nskb); can_skb_prv(nskb)->ifindex = dev->ifindex; can_skb_prv(nskb)->skbcnt = 0; nskb->dev = dev; can_skb_set_owner(nskb, sk); ncf = (struct canfd_frame *)nskb->data; skb_put_zero(nskb, so->ll.mtu); /* create & send flow control reply */ ncf->can_id = so->txid; if (so->opt.flags & CAN_ISOTP_TX_PADDING) { memset(ncf->data, so->opt.txpad_content, CAN_MAX_DLEN); ncf->len = CAN_MAX_DLEN; } 
else { ncf->len = ae + FC_CONTENT_SZ; } ncf->data[ae] = N_PCI_FC | flowstatus; ncf->data[ae + 1] = so->rxfc.bs; ncf->data[ae + 2] = so->rxfc.stmin; if (ae) ncf->data[0] = so->opt.ext_address; ncf->flags = so->ll.tx_flags; can_send_ret = can_send(nskb, 1); if (can_send_ret) pr_notice_once("can-isotp: %s: can_send_ret %pe\n", __func__, ERR_PTR(can_send_ret)); dev_put(dev); /* reset blocksize counter */ so->rx.bs = 0; /* reset last CF frame rx timestamp for rx stmin enforcement */ so->lastrxcf_tstamp = ktime_set(0, 0); /* start rx timeout watchdog */ hrtimer_start(&so->rxtimer, ktime_set(ISOTP_FC_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); return 0; } static void isotp_rcv_skb(struct sk_buff *skb, struct sock *sk) { struct sockaddr_can *addr = (struct sockaddr_can *)skb->cb; BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct sockaddr_can)); memset(addr, 0, sizeof(*addr)); addr->can_family = AF_CAN; addr->can_ifindex = skb->dev->ifindex; if (sock_queue_rcv_skb(sk, skb) < 0) kfree_skb(skb); } static u8 padlen(u8 datalen) { static const u8 plen[] = { 8, 8, 8, 8, 8, 8, 8, 8, 8, /* 0 - 8 */ 12, 12, 12, 12, /* 9 - 12 */ 16, 16, 16, 16, /* 13 - 16 */ 20, 20, 20, 20, /* 17 - 20 */ 24, 24, 24, 24, /* 21 - 24 */ 32, 32, 32, 32, 32, 32, 32, 32, /* 25 - 32 */ 48, 48, 48, 48, 48, 48, 48, 48, /* 33 - 40 */ 48, 48, 48, 48, 48, 48, 48, 48 /* 41 - 48 */ }; if (datalen > 48) return 64; return plen[datalen]; } /* check for length optimization and return 1/true when the check fails */ static int check_optimized(struct canfd_frame *cf, int start_index) { /* for CAN_DL <= 8 the start_index is equal to the CAN_DL as the * padding would start at this point. E.g. if the padding would * start at cf.data[7] cf->len has to be 7 to be optimal. * Note: The data[] index starts with zero. */ if (cf->len <= CAN_MAX_DLEN) return (cf->len != start_index); /* This relation is also valid in the non-linear DLC range, where * we need to take care of the minimal next possible CAN_DL. 
* The correct check would be (padlen(cf->len) != padlen(start_index)). * But as cf->len can only take discrete values from 12, .., 64 at this * point the padlen(cf->len) is always equal to cf->len. */ return (cf->len != padlen(start_index)); } /* check padding and return 1/true when the check fails */ static int check_pad(struct isotp_sock *so, struct canfd_frame *cf, int start_index, u8 content) { int i; /* no RX_PADDING value => check length of optimized frame length */ if (!(so->opt.flags & CAN_ISOTP_RX_PADDING)) { if (so->opt.flags & CAN_ISOTP_CHK_PAD_LEN) return check_optimized(cf, start_index); /* no valid test against empty value => ignore frame */ return 1; } /* check datalength of correctly padded CAN frame */ if ((so->opt.flags & CAN_ISOTP_CHK_PAD_LEN) && cf->len != padlen(cf->len)) return 1; /* check padding content */ if (so->opt.flags & CAN_ISOTP_CHK_PAD_DATA) { for (i = start_index; i < cf->len; i++) if (cf->data[i] != content) return 1; } return 0; } static void isotp_send_cframe(struct isotp_sock *so); static int isotp_rcv_fc(struct isotp_sock *so, struct canfd_frame *cf, int ae) { struct sock *sk = &so->sk; if (so->tx.state != ISOTP_WAIT_FC && so->tx.state != ISOTP_WAIT_FIRST_FC) return 0; hrtimer_cancel(&so->txtimer); if ((cf->len < ae + FC_CONTENT_SZ) || ((so->opt.flags & ISOTP_CHECK_PADDING) && check_pad(so, cf, ae + FC_CONTENT_SZ, so->opt.rxpad_content))) { /* malformed PDU - report 'not a data message' */ sk->sk_err = EBADMSG; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); so->tx.state = ISOTP_IDLE; wake_up_interruptible(&so->wait); return 1; } /* get static/dynamic communication params from first/every FC frame */ if (so->tx.state == ISOTP_WAIT_FIRST_FC || so->opt.flags & CAN_ISOTP_DYN_FC_PARMS) { so->txfc.bs = cf->data[ae + 1]; so->txfc.stmin = cf->data[ae + 2]; /* fix wrong STmin values according spec */ if (so->txfc.stmin > 0x7F && (so->txfc.stmin < 0xF1 || so->txfc.stmin > 0xF9)) so->txfc.stmin = 0x7F; so->tx_gap = ktime_set(0, 0); 
/* add transmission time for CAN frame N_As */ so->tx_gap = ktime_add_ns(so->tx_gap, so->frame_txtime); /* add waiting time for consecutive frames N_Cs */ if (so->opt.flags & CAN_ISOTP_FORCE_TXSTMIN) so->tx_gap = ktime_add_ns(so->tx_gap, so->force_tx_stmin); else if (so->txfc.stmin < 0x80) so->tx_gap = ktime_add_ns(so->tx_gap, so->txfc.stmin * 1000000); else so->tx_gap = ktime_add_ns(so->tx_gap, (so->txfc.stmin - 0xF0) * 100000); so->tx.state = ISOTP_WAIT_FC; } switch (cf->data[ae] & 0x0F) { case ISOTP_FC_CTS: so->tx.bs = 0; so->tx.state = ISOTP_SENDING; /* send CF frame and enable echo timeout handling */ hrtimer_start(&so->txtimer, ktime_set(ISOTP_ECHO_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); isotp_send_cframe(so); break; case ISOTP_FC_WT: /* start timer to wait for next FC frame */ hrtimer_start(&so->txtimer, ktime_set(ISOTP_FC_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); break; case ISOTP_FC_OVFLW: /* overflow on receiver side - report 'message too long' */ sk->sk_err = EMSGSIZE; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); fallthrough; default: /* stop this tx job */ so->tx.state = ISOTP_IDLE; wake_up_interruptible(&so->wait); } return 0; } static int isotp_rcv_sf(struct sock *sk, struct canfd_frame *cf, int pcilen, struct sk_buff *skb, int len) { struct isotp_sock *so = isotp_sk(sk); struct sk_buff *nskb; hrtimer_cancel(&so->rxtimer); so->rx.state = ISOTP_IDLE; if (!len || len > cf->len - pcilen) return 1; if ((so->opt.flags & ISOTP_CHECK_PADDING) && check_pad(so, cf, pcilen + len, so->opt.rxpad_content)) { /* malformed PDU - report 'not a data message' */ sk->sk_err = EBADMSG; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); return 1; } nskb = alloc_skb(len, gfp_any()); if (!nskb) return 1; memcpy(skb_put(nskb, len), &cf->data[pcilen], len); nskb->tstamp = skb->tstamp; nskb->dev = skb->dev; isotp_rcv_skb(nskb, sk); return 0; } static int isotp_rcv_ff(struct sock *sk, struct canfd_frame *cf, int ae) { struct isotp_sock *so = isotp_sk(sk); int i; int off; int 
ff_pci_sz; hrtimer_cancel(&so->rxtimer); so->rx.state = ISOTP_IDLE; /* get the used sender LL_DL from the (first) CAN frame data length */ so->rx.ll_dl = padlen(cf->len); /* the first frame has to use the entire frame up to LL_DL length */ if (cf->len != so->rx.ll_dl) return 1; /* get the FF_DL */ so->rx.len = (cf->data[ae] & 0x0F) << 8; so->rx.len += cf->data[ae + 1]; /* Check for FF_DL escape sequence supporting 32 bit PDU length */ if (so->rx.len) { ff_pci_sz = FF_PCI_SZ12; } else { /* FF_DL = 0 => get real length from next 4 bytes */ so->rx.len = cf->data[ae + 2] << 24; so->rx.len += cf->data[ae + 3] << 16; so->rx.len += cf->data[ae + 4] << 8; so->rx.len += cf->data[ae + 5]; ff_pci_sz = FF_PCI_SZ32; } /* take care of a potential SF_DL ESC offset for TX_DL > 8 */ off = (so->rx.ll_dl > CAN_MAX_DLEN) ? 1 : 0; if (so->rx.len + ae + off + ff_pci_sz < so->rx.ll_dl) return 1; /* PDU size > default => try max_pdu_size */ if (so->rx.len > so->rx.buflen && so->rx.buflen < max_pdu_size) { u8 *newbuf = kmalloc(max_pdu_size, GFP_ATOMIC); if (newbuf) { so->rx.buf = newbuf; so->rx.buflen = max_pdu_size; } } if (so->rx.len > so->rx.buflen) { /* send FC frame with overflow status */ isotp_send_fc(sk, ae, ISOTP_FC_OVFLW); return 1; } /* copy the first received data bytes */ so->rx.idx = 0; for (i = ae + ff_pci_sz; i < so->rx.ll_dl; i++) so->rx.buf[so->rx.idx++] = cf->data[i]; /* initial setup for this pdu reception */ so->rx.sn = 1; so->rx.state = ISOTP_WAIT_DATA; /* no creation of flow control frames */ if (so->opt.flags & CAN_ISOTP_LISTEN_MODE) return 0; /* send our first FC frame */ isotp_send_fc(sk, ae, ISOTP_FC_CTS); return 0; } static int isotp_rcv_cf(struct sock *sk, struct canfd_frame *cf, int ae, struct sk_buff *skb) { struct isotp_sock *so = isotp_sk(sk); struct sk_buff *nskb; int i; if (so->rx.state != ISOTP_WAIT_DATA) return 0; /* drop if timestamp gap is less than force_rx_stmin nano secs */ if (so->opt.flags & CAN_ISOTP_FORCE_RXSTMIN) { if 
(ktime_to_ns(ktime_sub(skb->tstamp, so->lastrxcf_tstamp)) < so->force_rx_stmin) return 0; so->lastrxcf_tstamp = skb->tstamp; } hrtimer_cancel(&so->rxtimer); /* CFs are never longer than the FF */ if (cf->len > so->rx.ll_dl) return 1; /* CFs have usually the LL_DL length */ if (cf->len < so->rx.ll_dl) { /* this is only allowed for the last CF */ if (so->rx.len - so->rx.idx > so->rx.ll_dl - ae - N_PCI_SZ) return 1; } if ((cf->data[ae] & 0x0F) != so->rx.sn) { /* wrong sn detected - report 'illegal byte sequence' */ sk->sk_err = EILSEQ; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); /* reset rx state */ so->rx.state = ISOTP_IDLE; return 1; } so->rx.sn++; so->rx.sn %= 16; for (i = ae + N_PCI_SZ; i < cf->len; i++) { so->rx.buf[so->rx.idx++] = cf->data[i]; if (so->rx.idx >= so->rx.len) break; } if (so->rx.idx >= so->rx.len) { /* we are done */ so->rx.state = ISOTP_IDLE; if ((so->opt.flags & ISOTP_CHECK_PADDING) && check_pad(so, cf, i + 1, so->opt.rxpad_content)) { /* malformed PDU - report 'not a data message' */ sk->sk_err = EBADMSG; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); return 1; } nskb = alloc_skb(so->rx.len, gfp_any()); if (!nskb) return 1; memcpy(skb_put(nskb, so->rx.len), so->rx.buf, so->rx.len); nskb->tstamp = skb->tstamp; nskb->dev = skb->dev; isotp_rcv_skb(nskb, sk); return 0; } /* perform blocksize handling, if enabled */ if (!so->rxfc.bs || ++so->rx.bs < so->rxfc.bs) { /* start rx timeout watchdog */ hrtimer_start(&so->rxtimer, ktime_set(ISOTP_FC_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); return 0; } /* no creation of flow control frames */ if (so->opt.flags & CAN_ISOTP_LISTEN_MODE) return 0; /* we reached the specified blocksize so->rxfc.bs */ isotp_send_fc(sk, ae, ISOTP_FC_CTS); return 0; } static void isotp_rcv(struct sk_buff *skb, void *data) { struct sock *sk = (struct sock *)data; struct isotp_sock *so = isotp_sk(sk); struct canfd_frame *cf; int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 
1 : 0; u8 n_pci_type, sf_dl; /* Strictly receive only frames with the configured MTU size * => clear separation of CAN2.0 / CAN FD transport channels */ if (skb->len != so->ll.mtu) return; cf = (struct canfd_frame *)skb->data; /* if enabled: check reception of my configured extended address */ if (ae && cf->data[0] != so->opt.rx_ext_address) return; n_pci_type = cf->data[ae] & 0xF0; /* Make sure the state changes and data structures stay consistent at * CAN frame reception time. This locking is not needed in real world * use cases but the inconsistency can be triggered with syzkaller. */ spin_lock(&so->rx_lock); if (so->opt.flags & CAN_ISOTP_HALF_DUPLEX) { /* check rx/tx path half duplex expectations */ if ((so->tx.state != ISOTP_IDLE && n_pci_type != N_PCI_FC) || (so->rx.state != ISOTP_IDLE && n_pci_type == N_PCI_FC)) goto out_unlock; } switch (n_pci_type) { case N_PCI_FC: /* tx path: flow control frame containing the FC parameters */ isotp_rcv_fc(so, cf, ae); break; case N_PCI_SF: /* rx path: single frame * * As we do not have a rx.ll_dl configuration, we can only test * if the CAN frames payload length matches the LL_DL == 8 * requirements - no matter if it's CAN 2.0 or CAN FD */ /* get the SF_DL from the N_PCI byte */ sf_dl = cf->data[ae] & 0x0F; if (cf->len <= CAN_MAX_DLEN) { isotp_rcv_sf(sk, cf, SF_PCI_SZ4 + ae, skb, sf_dl); } else { if (can_is_canfd_skb(skb)) { /* We have a CAN FD frame and CAN_DL is greater than 8: * Only frames with the SF_DL == 0 ESC value are valid. * * If so take care of the increased SF PCI size * (SF_PCI_SZ8) to point to the message content behind * the extended SF PCI info and get the real SF_DL * length value from the formerly first data byte. 
*/ if (sf_dl == 0) isotp_rcv_sf(sk, cf, SF_PCI_SZ8 + ae, skb, cf->data[SF_PCI_SZ4 + ae]); } } break; case N_PCI_FF: /* rx path: first frame */ isotp_rcv_ff(sk, cf, ae); break; case N_PCI_CF: /* rx path: consecutive frame */ isotp_rcv_cf(sk, cf, ae, skb); break; } out_unlock: spin_unlock(&so->rx_lock); } static void isotp_fill_dataframe(struct canfd_frame *cf, struct isotp_sock *so, int ae, int off) { int pcilen = N_PCI_SZ + ae + off; int space = so->tx.ll_dl - pcilen; int num = min_t(int, so->tx.len - so->tx.idx, space); int i; cf->can_id = so->txid; cf->len = num + pcilen; if (num < space) { if (so->opt.flags & CAN_ISOTP_TX_PADDING) { /* user requested padding */ cf->len = padlen(cf->len); memset(cf->data, so->opt.txpad_content, cf->len); } else if (cf->len > CAN_MAX_DLEN) { /* mandatory padding for CAN FD frames */ cf->len = padlen(cf->len); memset(cf->data, CAN_ISOTP_DEFAULT_PAD_CONTENT, cf->len); } } for (i = 0; i < num; i++) cf->data[pcilen + i] = so->tx.buf[so->tx.idx++]; if (ae) cf->data[0] = so->opt.ext_address; } static void isotp_send_cframe(struct isotp_sock *so) { struct sock *sk = &so->sk; struct sk_buff *skb; struct net_device *dev; struct canfd_frame *cf; int can_send_ret; int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 
1 : 0; dev = dev_get_by_index(sock_net(sk), so->ifindex); if (!dev) return; skb = alloc_skb(so->ll.mtu + sizeof(struct can_skb_priv), GFP_ATOMIC); if (!skb) { dev_put(dev); return; } can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; can_skb_prv(skb)->skbcnt = 0; cf = (struct canfd_frame *)skb->data; skb_put_zero(skb, so->ll.mtu); /* create consecutive frame */ isotp_fill_dataframe(cf, so, ae, 0); /* place consecutive frame N_PCI in appropriate index */ cf->data[ae] = N_PCI_CF | so->tx.sn++; so->tx.sn %= 16; so->tx.bs++; cf->flags = so->ll.tx_flags; skb->dev = dev; can_skb_set_owner(skb, sk); /* cfecho should have been zero'ed by init/isotp_rcv_echo() */ if (so->cfecho) pr_notice_once("can-isotp: cfecho is %08X != 0\n", so->cfecho); /* set consecutive frame echo tag */ so->cfecho = *(u32 *)cf->data; /* send frame with local echo enabled */ can_send_ret = can_send(skb, 1); if (can_send_ret) { pr_notice_once("can-isotp: %s: can_send_ret %pe\n", __func__, ERR_PTR(can_send_ret)); if (can_send_ret == -ENOBUFS) pr_notice_once("can-isotp: tx queue is full\n"); } dev_put(dev); } static void isotp_create_fframe(struct canfd_frame *cf, struct isotp_sock *so, int ae) { int i; int ff_pci_sz; cf->can_id = so->txid; cf->len = so->tx.ll_dl; if (ae) cf->data[0] = so->opt.ext_address; /* create N_PCI bytes with 12/32 bit FF_DL data length */ if (so->tx.len > MAX_12BIT_PDU_SIZE) { /* use 32 bit FF_DL notation */ cf->data[ae] = N_PCI_FF; cf->data[ae + 1] = 0; cf->data[ae + 2] = (u8)(so->tx.len >> 24) & 0xFFU; cf->data[ae + 3] = (u8)(so->tx.len >> 16) & 0xFFU; cf->data[ae + 4] = (u8)(so->tx.len >> 8) & 0xFFU; cf->data[ae + 5] = (u8)so->tx.len & 0xFFU; ff_pci_sz = FF_PCI_SZ32; } else { /* use 12 bit FF_DL notation */ cf->data[ae] = (u8)(so->tx.len >> 8) | N_PCI_FF; cf->data[ae + 1] = (u8)so->tx.len & 0xFFU; ff_pci_sz = FF_PCI_SZ12; } /* add first data bytes depending on ae */ for (i = ae + ff_pci_sz; i < so->tx.ll_dl; i++) cf->data[i] = so->tx.buf[so->tx.idx++]; so->tx.sn 
= 1; } static void isotp_rcv_echo(struct sk_buff *skb, void *data) { struct sock *sk = (struct sock *)data; struct isotp_sock *so = isotp_sk(sk); struct canfd_frame *cf = (struct canfd_frame *)skb->data; /* only handle my own local echo CF/SF skb's (no FF!) */ if (skb->sk != sk || so->cfecho != *(u32 *)cf->data) return; /* cancel local echo timeout */ hrtimer_cancel(&so->txtimer); /* local echo skb with consecutive frame has been consumed */ so->cfecho = 0; if (so->tx.idx >= so->tx.len) { /* we are done */ so->tx.state = ISOTP_IDLE; wake_up_interruptible(&so->wait); return; } if (so->txfc.bs && so->tx.bs >= so->txfc.bs) { /* stop and wait for FC with timeout */ so->tx.state = ISOTP_WAIT_FC; hrtimer_start(&so->txtimer, ktime_set(ISOTP_FC_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); return; } /* no gap between data frames needed => use burst mode */ if (!so->tx_gap) { /* enable echo timeout handling */ hrtimer_start(&so->txtimer, ktime_set(ISOTP_ECHO_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); isotp_send_cframe(so); return; } /* start timer to send next consecutive frame with correct delay */ hrtimer_start(&so->txfrtimer, so->tx_gap, HRTIMER_MODE_REL_SOFT); } static enum hrtimer_restart isotp_tx_timer_handler(struct hrtimer *hrtimer) { struct isotp_sock *so = container_of(hrtimer, struct isotp_sock, txtimer); struct sock *sk = &so->sk; /* don't handle timeouts in IDLE or SHUTDOWN state */ if (so->tx.state == ISOTP_IDLE || so->tx.state == ISOTP_SHUTDOWN) return HRTIMER_NORESTART; /* we did not get any flow control or echo frame in time */ /* report 'communication error on send' */ sk->sk_err = ECOMM; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); /* reset tx state */ so->tx.state = ISOTP_IDLE; wake_up_interruptible(&so->wait); return HRTIMER_NORESTART; } static enum hrtimer_restart isotp_txfr_timer_handler(struct hrtimer *hrtimer) { struct isotp_sock *so = container_of(hrtimer, struct isotp_sock, txfrtimer); /* start echo timeout handling and cover below protocol error */ 
hrtimer_start(&so->txtimer, ktime_set(ISOTP_ECHO_TIMEOUT, 0),
		      HRTIMER_MODE_REL_SOFT);

	/* cfecho should be consumed by isotp_rcv_echo() here */
	if (so->tx.state == ISOTP_SENDING && !so->cfecho)
		isotp_send_cframe(so);

	return HRTIMER_NORESTART;
}

/*
 * isotp_sendmsg - start the transmission of one PDU
 * @sock: socket to send on
 * @msg: message holding the payload and the MSG_* flags
 * @size: payload length in bytes
 *
 * Claims the tx state machine (ISOTP_IDLE -> ISOTP_SENDING), copies the
 * payload into so->tx.buf and sends either a single frame or the first
 * frame of a segmented transfer. With CAN_ISOTP_WAIT_TX_DONE set the call
 * blocks until the tx state returns to ISOTP_IDLE.
 *
 * Return: @size on success, otherwise a negative errno:
 *  -EADDRNOTAVAIL  socket not bound or shutting down
 *  -EAGAIN         transmission in progress and MSG_DONTWAIT set
 *  -EINVAL         @size is zero, exceeds the tx buffer, or does not fit
 *                  into a single frame in SF_BROADCAST mode
 *  -ENXIO          bound interface no longer exists
 *  or the error reported by the wait, copy, allocation or can_send() calls.
 */
static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
{
	struct sock *sk = sock->sk;
	struct isotp_sock *so = isotp_sk(sk);
	struct sk_buff *skb;
	struct net_device *dev;
	struct canfd_frame *cf;
	int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 1 : 0;
	int wait_tx_done = (so->opt.flags & CAN_ISOTP_WAIT_TX_DONE) ? 1 : 0;
	s64 hrtimer_sec = ISOTP_ECHO_TIMEOUT;
	int off;
	int err;

	if (!so->bound || so->tx.state == ISOTP_SHUTDOWN)
		return -EADDRNOTAVAIL;

	/* atomically claim the tx state machine; loop until it was idle */
	while (cmpxchg(&so->tx.state, ISOTP_IDLE, ISOTP_SENDING) != ISOTP_IDLE) {
		/* we do not support multiple buffers - for now */
		if (msg->msg_flags & MSG_DONTWAIT)
			return -EAGAIN;

		if (so->tx.state == ISOTP_SHUTDOWN)
			return -EADDRNOTAVAIL;

		/* wait for complete transmission of current pdu */
		err = wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE);
		if (err)
			goto err_event_drop;
	}

	/* PDU size > default => try max_pdu_size */
	if (size > so->tx.buflen && so->tx.buflen < max_pdu_size) {
		u8 *newbuf = kmalloc(max_pdu_size, GFP_KERNEL);

		/* on allocation failure the old buffer stays in use */
		if (newbuf) {
			so->tx.buf = newbuf;
			so->tx.buflen = max_pdu_size;
		}
	}

	if (!size || size > so->tx.buflen) {
		err = -EINVAL;
		goto err_out_drop;
	}

	/* take care of a potential SF_DL ESC offset for TX_DL > 8 */
	off = (so->tx.ll_dl > CAN_MAX_DLEN) ? 1 : 0;

	/* does the given data fit into a single frame for SF_BROADCAST?
*/
	if ((isotp_bc_flags(so) == CAN_ISOTP_SF_BROADCAST) &&
	    (size > so->tx.ll_dl - SF_PCI_SZ4 - ae - off)) {
		err = -EINVAL;
		goto err_out_drop;
	}

	/* copy the payload from the message into the tx buffer */
	err = memcpy_from_msg(so->tx.buf, msg, size);
	if (err < 0)
		goto err_out_drop;

	/* take a device reference; it is dropped again after can_send() */
	dev = dev_get_by_index(sock_net(sk), so->ifindex);
	if (!dev) {
		err = -ENXIO;
		goto err_out_drop;
	}

	/* on failure sock_alloc_send_skb() stores the error code in err */
	skb = sock_alloc_send_skb(sk, so->ll.mtu + sizeof(struct can_skb_priv),
				  msg->msg_flags & MSG_DONTWAIT, &err);
	if (!skb) {
		dev_put(dev);
		goto err_out_drop;
	}

	can_skb_reserve(skb);
	can_skb_prv(skb)->ifindex = dev->ifindex;
	can_skb_prv(skb)->skbcnt = 0;

	/* start a new PDU: total length and read position in tx.buf */
	so->tx.len = size;
	so->tx.idx = 0;

	cf = (struct canfd_frame *)skb->data;
	skb_put_zero(skb, so->ll.mtu);

	/* cfecho should have been zero'ed by init / former isotp_rcv_echo() */
	if (so->cfecho)
		pr_notice_once("can-isotp: uninit cfecho %08X\n", so->cfecho);

	/* check for single frame transmission depending on TX_DL */
	if (size <= so->tx.ll_dl - SF_PCI_SZ4 - ae - off) {
		/* The message size generally fits into a SingleFrame - good.
		 *
		 * SF_DL ESC offset optimization:
		 *
		 * When TX_DL is greater 8 but the message would still fit
		 * into a 8 byte CAN frame, we can omit the offset.
		 * This prevents a protocol caused length extension from
		 * CAN_DL = 8 to CAN_DL = 12 due to the SF_SL ESC handling.
*/
		if (size <= CAN_MAX_DLEN - SF_PCI_SZ4 - ae)
			off = 0;

		isotp_fill_dataframe(cf, so, ae, off);

		/* place single frame N_PCI w/o length in appropriate index */
		cf->data[ae] = N_PCI_SF;

		/* place SF_DL size value depending on the SF_DL ESC offset */
		if (off)
			cf->data[SF_PCI_SZ4 + ae] = size;
		else
			cf->data[ae] |= size;

		/* set CF echo tag for isotp_rcv_echo() (SF-mode) */
		so->cfecho = *(u32 *)cf->data;
	} else {
		/* send first frame */

		isotp_create_fframe(cf, so, ae);

		if (isotp_bc_flags(so) == CAN_ISOTP_CF_BROADCAST) {
			/* set timer for FC-less operation (STmin = 0) */
			if (so->opt.flags & CAN_ISOTP_FORCE_TXSTMIN)
				so->tx_gap = ktime_set(0, so->force_tx_stmin);
			else
				so->tx_gap = ktime_set(0, so->frame_txtime);

			/* disable wait for FCs due to activated block size */
			so->txfc.bs = 0;

			/* set CF echo tag for isotp_rcv_echo() (CF-mode) */
			so->cfecho = *(u32 *)cf->data;
		} else {
			/* standard flow control check */
			so->tx.state = ISOTP_WAIT_FIRST_FC;

			/* start timeout for FC */
			hrtimer_sec = ISOTP_FC_TIMEOUT;

			/* no CF echo tag for isotp_rcv_echo() (FF-mode) */
			so->cfecho = 0;
		}
	}

	/* the timeout is armed before the frame is handed to can_send() */
	hrtimer_start(&so->txtimer, ktime_set(hrtimer_sec, 0),
		      HRTIMER_MODE_REL_SOFT);

	/* send the first or only CAN frame */
	cf->flags = so->ll.tx_flags;

	skb->dev = dev;
	skb->sk = sk;
	err = can_send(skb, 1);
	dev_put(dev);
	if (err) {
		pr_notice_once("can-isotp: %s: can_send_ret %pe\n",
			       __func__, ERR_PTR(err));

		/* no transmission -> no timeout monitoring */
		hrtimer_cancel(&so->txtimer);

		/* reset consecutive frame echo tag */
		so->cfecho = 0;

		goto err_out_drop;
	}

	if (wait_tx_done) {
		/* wait for complete transmission of current pdu */
		err = wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE);
		if (err)
			goto err_event_drop;

		/* pick up an error stored on the socket while we waited */
		err = sock_error(sk);
		if (err)
			return err;
	}

	return size;

err_event_drop:
	/* got signal: force tx state machine to be idle */
	so->tx.state = ISOTP_IDLE;
	hrtimer_cancel(&so->txfrtimer);
	hrtimer_cancel(&so->txtimer);
err_out_drop:
	/* drop this PDU and unlock a potential wait queue */
	so->tx.state
= ISOTP_IDLE;
	wake_up_interruptible(&so->wait);

	return err;
}

/*
 * isotp_recvmsg - deliver one received PDU to the caller
 * @sock: socket to read from
 * @msg: destination message; msg_name is filled when provided
 * @size: size of the caller's buffer
 * @flags: only MSG_DONTWAIT, MSG_TRUNC, MSG_PEEK and MSG_CMSG_COMPAT
 *         are accepted
 *
 * Copies at most @size bytes and sets MSG_TRUNC in msg->msg_flags when the
 * PDU is longer than the buffer.
 *
 * Return: number of bytes copied, or the full PDU length when MSG_TRUNC was
 * passed in @flags; -EINVAL for unsupported flags, -EADDRNOTAVAIL when the
 * socket is not bound, or the error from the datagram/copy helpers.
 */
static int isotp_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
			 int flags)
{
	struct sock *sk = sock->sk;
	struct sk_buff *skb;
	struct isotp_sock *so = isotp_sk(sk);
	int ret = 0;

	if (flags & ~(MSG_DONTWAIT | MSG_TRUNC | MSG_PEEK | MSG_CMSG_COMPAT))
		return -EINVAL;

	if (!so->bound)
		return -EADDRNOTAVAIL;

	skb = skb_recv_datagram(sk, flags, &ret);
	if (!skb)
		return ret;

	/* clamp the copy length to the PDU length, flag truncation otherwise */
	if (size < skb->len)
		msg->msg_flags |= MSG_TRUNC;
	else
		size = skb->len;

	ret = memcpy_to_msg(msg, skb->data, size);
	if (ret < 0)
		goto out_err;

	sock_recv_cmsgs(msg, sk, skb);

	if (msg->msg_name) {
		__sockaddr_check_size(ISOTP_MIN_NAMELEN);
		msg->msg_namelen = ISOTP_MIN_NAMELEN;
		/* the address is taken from the skb control buffer */
		memcpy(msg->msg_name, skb->cb, msg->msg_namelen);
	}

	/* set length of return value */
	ret = (flags & MSG_TRUNC) ? skb->len : size;

out_err:
	skb_free_datagram(sk, skb);

	return ret;
}

/*
 * isotp_release - tear down an isotp socket
 * @sock: socket being closed
 *
 * Waits for a running transmission, moves the tx state to ISOTP_SHUTDOWN,
 * removes the socket from the notifier list, unregisters the CAN receive
 * filters, cancels all timers, frees buffers that are not the built-in
 * ones and drops the socket reference.
 *
 * Return: always 0.
 */
static int isotp_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct isotp_sock *so;
	struct net *net;

	if (!sk)
		return 0;

	so = isotp_sk(sk);
	net = sock_net(sk);

	/* wait for complete transmission of current pdu */
	while (wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE) == 0 &&
	       cmpxchg(&so->tx.state, ISOTP_IDLE, ISOTP_SHUTDOWN) != ISOTP_IDLE)
		;

	/* force state machines to be idle also when a signal occurred */
	so->tx.state = ISOTP_SHUTDOWN;
	so->rx.state = ISOTP_IDLE;

	/* do not unlink while the notifier is currently working on this socket */
	spin_lock(&isotp_notifier_lock);
	while (isotp_busy_notifier == so) {
		spin_unlock(&isotp_notifier_lock);
		schedule_timeout_uninterruptible(1);
		spin_lock(&isotp_notifier_lock);
	}
	list_del(&so->notifier);
	spin_unlock(&isotp_notifier_lock);

	lock_sock(sk);

	/* remove current filters & unregister */
	if (so->bound) {
		if (so->ifindex) {
			struct net_device *dev;

			dev = dev_get_by_index(net, so->ifindex);
			if (dev) {
				if (isotp_register_rxid(so))
					can_rx_unregister(net, dev, so->rxid,
							  SINGLE_MASK(so->rxid),
							  isotp_rcv, sk);

				can_rx_unregister(net, dev, so->txid,
						  SINGLE_MASK(so->txid),
						  isotp_rcv_echo, sk);
dev_put(dev);
				synchronize_rcu();
			}
		}
	}

	hrtimer_cancel(&so->txfrtimer);
	hrtimer_cancel(&so->txtimer);
	hrtimer_cancel(&so->rxtimer);

	so->ifindex = 0;
	so->bound = 0;

	/* only free buffers that replaced the built-in sbuf arrays */
	if (so->rx.buf != so->rx.sbuf)
		kfree(so->rx.buf);

	if (so->tx.buf != so->tx.sbuf)
		kfree(so->tx.buf);

	sock_orphan(sk);
	sock->sk = NULL;

	release_sock(sk);
	sock_put(sk);

	return 0;
}

/*
 * isotp_bind - bind the socket to a CAN interface and tx/rx CAN ids
 * @sock: socket to bind
 * @uaddr: struct sockaddr_can with can_ifindex and can_addr.tp ids
 * @len: length of @uaddr
 *
 * Validates the address, registers the receive filter for rx_id (only when
 * isotp_register_rxid() says so) and the echo filter for tx_id, then stores
 * the new settings. If the interface is not up, ENETDOWN is reported on the
 * socket after the bind completed.
 *
 * Return: 0 on success, otherwise
 *  -EINVAL         short address, wrong family, CAN id with invalid bits,
 *                  socket already bound, or device MTU below so->ll.mtu
 *  -ENODEV         no interface index, unknown device, or not a CAN device
 *  -EADDRNOTAVAIL  rx_id equals tx_id while the rx_id is to be registered
 */
static int isotp_bind(struct socket *sock, struct sockaddr *uaddr, int len)
{
	struct sockaddr_can *addr = (struct sockaddr_can *)uaddr;
	struct sock *sk = sock->sk;
	struct isotp_sock *so = isotp_sk(sk);
	struct net *net = sock_net(sk);
	int ifindex;
	struct net_device *dev;
	canid_t tx_id = addr->can_addr.tp.tx_id;
	canid_t rx_id = addr->can_addr.tp.rx_id;
	int err = 0;
	int notify_enetdown = 0;

	if (len < ISOTP_MIN_NAMELEN)
		return -EINVAL;

	if (addr->can_family != AF_CAN)
		return -EINVAL;

	/* sanitize tx CAN identifier */
	if (tx_id & CAN_EFF_FLAG)
		tx_id &= (CAN_EFF_FLAG | CAN_EFF_MASK);
	else
		tx_id &= CAN_SFF_MASK;

	/* give feedback on wrong CAN-ID value */
	if (tx_id != addr->can_addr.tp.tx_id)
		return -EINVAL;

	/* sanitize rx CAN identifier (if needed) */
	if (isotp_register_rxid(so)) {
		if (rx_id & CAN_EFF_FLAG)
			rx_id &= (CAN_EFF_FLAG | CAN_EFF_MASK);
		else
			rx_id &= CAN_SFF_MASK;

		/* give feedback on wrong CAN-ID value */
		if (rx_id != addr->can_addr.tp.rx_id)
			return -EINVAL;
	}

	if (!addr->can_ifindex)
		return -ENODEV;

	lock_sock(sk);

	if (so->bound) {
		err = -EINVAL;
		goto out;
	}

	/* ensure different CAN IDs when the rx_id is to be registered */
	if (isotp_register_rxid(so) && rx_id == tx_id) {
		err = -EADDRNOTAVAIL;
		goto out;
	}

	dev = dev_get_by_index(net, addr->can_ifindex);
	if (!dev) {
		err = -ENODEV;
		goto out;
	}
	if (dev->type != ARPHRD_CAN) {
		dev_put(dev);
		err = -ENODEV;
		goto out;
	}
	if (dev->mtu < so->ll.mtu) {
		dev_put(dev);
		err = -EINVAL;
		goto out;
	}
	/* binding to a down interface succeeds; the error is reported below */
	if (!(dev->flags & IFF_UP))
		notify_enetdown = 1;

	ifindex = dev->ifindex;

	if (isotp_register_rxid(so))
		can_rx_register(net, dev, rx_id, SINGLE_MASK(rx_id),
				isotp_rcv, sk, "isotp", sk);

	/* no consecutive
frame echo skb in flight */
	so->cfecho = 0;

	/* register for echo skb's */
	can_rx_register(net, dev, tx_id, SINGLE_MASK(tx_id),
			isotp_rcv_echo, sk, "isotpe", sk);

	dev_put(dev);

	/* switch to new settings */
	so->ifindex = ifindex;
	so->rxid = rx_id;
	so->txid = tx_id;
	so->bound = 1;

out:
	release_sock(sk);

	if (notify_enetdown) {
		sk->sk_err = ENETDOWN;
		if (!sock_flag(sk, SOCK_DEAD))
			sk_error_report(sk);
	}

	return err;
}

/*
 * isotp_getname - report the bound address of the socket
 * @sock: socket to query
 * @uaddr: output buffer, written as struct sockaddr_can
 * @peer: non-zero requests the peer address, which is not supported
 *
 * Return: ISOTP_MIN_NAMELEN on success, -EOPNOTSUPP when @peer is set.
 */
static int isotp_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
{
	struct sockaddr_can *addr = (struct sockaddr_can *)uaddr;
	struct sock *sk = sock->sk;
	struct isotp_sock *so = isotp_sk(sk);

	if (peer)
		return -EOPNOTSUPP;

	/* clear the reported part first so no stale bytes are handed out */
	memset(addr, 0, ISOTP_MIN_NAMELEN);
	addr->can_family = AF_CAN;
	addr->can_ifindex = so->ifindex;
	addr->can_addr.tp.rx_id = so->rxid;
	addr->can_addr.tp.tx_id = so->txid;

	return ISOTP_MIN_NAMELEN;
}

/*
 * isotp_setsockopt_locked - apply one socket option
 * @sock: socket to configure
 * @level: option level (not evaluated in this function)
 * @optname: one of the CAN_ISOTP_* option names
 * @optval: option value
 * @optlen: length of @optval, must match the option's type exactly
 *
 * Options can only be changed while the socket is unbound. The name
 * suggests the caller holds the socket lock; isotp_setsockopt() does.
 *
 * Return: 0 on success, -EISCONN when bound, -EINVAL for a wrong length or
 * value, -EFAULT when copying fails, -ENOPROTOOPT for an unknown option.
 */
static int isotp_setsockopt_locked(struct socket *sock, int level, int optname,
			    sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct isotp_sock *so = isotp_sk(sk);
	int ret = 0;

	if (so->bound)
		return -EISCONN;

	switch (optname) {
	case CAN_ISOTP_OPTS:
		if (optlen != sizeof(struct can_isotp_options))
			return -EINVAL;

		if (copy_from_sockptr(&so->opt, optval, optlen))
			return -EFAULT;

		/* no separate rx_ext_address is given => use ext_address */
		if (!(so->opt.flags & CAN_ISOTP_RX_EXT_ADDR))
			so->opt.rx_ext_address = so->opt.ext_address;

		/* these broadcast flags are not allowed together */
		if (isotp_bc_flags(so) == ISOTP_ALL_BC_FLAGS) {
			/* CAN_ISOTP_SF_BROADCAST is prioritized */
			so->opt.flags &= ~CAN_ISOTP_CF_BROADCAST;

			/* give user feedback on wrong config attempt */
			ret = -EINVAL;
		}

		/* check for frame_txtime changes (0 => no changes) */
		if (so->opt.frame_txtime) {
			if (so->opt.frame_txtime == CAN_ISOTP_FRAME_TXTIME_ZERO)
				so->frame_txtime = 0;
			else
				so->frame_txtime = so->opt.frame_txtime;
		}
		break;

	case CAN_ISOTP_RECV_FC:
		if (optlen != sizeof(struct can_isotp_fc_options))
			return -EINVAL;

		if (copy_from_sockptr(&so->rxfc,
optval, optlen))
			return -EFAULT;
		break;

	case CAN_ISOTP_TX_STMIN:
		if (optlen != sizeof(u32))
			return -EINVAL;

		if (copy_from_sockptr(&so->force_tx_stmin, optval, optlen))
			return -EFAULT;
		break;

	case CAN_ISOTP_RX_STMIN:
		if (optlen != sizeof(u32))
			return -EINVAL;

		if (copy_from_sockptr(&so->force_rx_stmin, optval, optlen))
			return -EFAULT;
		break;

	case CAN_ISOTP_LL_OPTS:
		if (optlen == sizeof(struct can_isotp_ll_options)) {
			/* validate a local copy before touching so->ll */
			struct can_isotp_ll_options ll;

			if (copy_from_sockptr(&ll, optval, optlen))
				return -EFAULT;

			/* check for correct ISO 11898-1 DLC data length */
			if (ll.tx_dl != padlen(ll.tx_dl))
				return -EINVAL;

			if (ll.mtu != CAN_MTU && ll.mtu != CANFD_MTU)
				return -EINVAL;

			/* with CAN_MTU: tx_dl limited to CAN_MAX_DLEN, no tx_flags */
			if (ll.mtu == CAN_MTU &&
			    (ll.tx_dl > CAN_MAX_DLEN || ll.tx_flags != 0))
				return -EINVAL;

			memcpy(&so->ll, &ll, sizeof(ll));

			/* set ll_dl for tx path to similar place as for rx */
			so->tx.ll_dl = ll.tx_dl;
		} else {
			return -EINVAL;
		}
		break;

	default:
		ret = -ENOPROTOOPT;
	}

	return ret;
}

/*
 * isotp_setsockopt - setsockopt entry point
 *
 * Rejects every level other than SOL_CAN_ISOTP with -EINVAL, otherwise runs
 * isotp_setsockopt_locked() under the socket lock and returns its result.
 */
static int isotp_setsockopt(struct socket *sock, int level, int optname,
			    sockptr_t optval, unsigned int optlen)

{
	struct sock *sk = sock->sk;
	int ret;

	if (level != SOL_CAN_ISOTP)
		return -EINVAL;

	lock_sock(sk);
	ret = isotp_setsockopt_locked(sock, level, optname, optval, optlen);
	release_sock(sk);
	return ret;
}

/*
 * isotp_getsockopt - getsockopt entry point
 * @sock: socket to query
 * @level: must be SOL_CAN_ISOTP
 * @optname: one of the CAN_ISOTP_* option names
 * @optval: user buffer receiving the option value
 * @optlen: in: size of @optval, out: number of bytes written
 *
 * The copied length is the smaller of the caller's buffer size and the
 * size of the option, so a short buffer gets a truncated value.
 *
 * Return: 0 on success, -EINVAL for a wrong level or negative length,
 * -EFAULT on a user copy failure, -ENOPROTOOPT for an unknown option.
 */
static int isotp_getsockopt(struct socket *sock, int level, int optname,
			    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;
	struct isotp_sock *so = isotp_sk(sk);
	int len;
	void *val;

	if (level != SOL_CAN_ISOTP)
		return -EINVAL;
	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case CAN_ISOTP_OPTS:
		len = min_t(int, len, sizeof(struct can_isotp_options));
		val = &so->opt;
		break;

	case CAN_ISOTP_RECV_FC:
		len = min_t(int, len, sizeof(struct can_isotp_fc_options));
		val = &so->rxfc;
		break;

	case CAN_ISOTP_TX_STMIN:
		len = min_t(int, len, sizeof(u32));
		val = &so->force_tx_stmin;
		break;

	case CAN_ISOTP_RX_STMIN:
		len = min_t(int, len, sizeof(u32));
		val = &so->force_rx_stmin;
break;

	case CAN_ISOTP_LL_OPTS:
		len = min_t(int, len, sizeof(struct can_isotp_ll_options));
		val = &so->ll;
		break;

	default:
		return -ENOPROTOOPT;
	}

	/* report the copied length first, then the value itself */
	if (put_user(len, optlen))
		return -EFAULT;
	if (copy_to_user(optval, val, len))
		return -EFAULT;
	return 0;
}

/*
 * isotp_notify - apply a netdevice event to one socket
 * @so: socket to update
 * @msg: NETDEV_UNREGISTER or NETDEV_DOWN; other values are ignored
 * @dev: device the event refers to
 *
 * Events for a different network namespace or a different interface than
 * the one the socket is bound to are ignored. NETDEV_UNREGISTER removes the
 * receive filters, unbinds the socket and reports ENODEV. NETDEV_DOWN only
 * reports ENETDOWN.
 */
static void isotp_notify(struct isotp_sock *so, unsigned long msg,
			 struct net_device *dev)
{
	struct sock *sk = &so->sk;

	if (!net_eq(dev_net(dev), sock_net(sk)))
		return;

	if (so->ifindex != dev->ifindex)
		return;

	switch (msg) {
	case NETDEV_UNREGISTER:
		lock_sock(sk);
		/* remove current filters & unregister */
		if (so->bound) {
			if (isotp_register_rxid(so))
				can_rx_unregister(dev_net(dev), dev, so->rxid,
						  SINGLE_MASK(so->rxid),
						  isotp_rcv, sk);

			can_rx_unregister(dev_net(dev), dev, so->txid,
					  SINGLE_MASK(so->txid),
					  isotp_rcv_echo, sk);
		}

		so->ifindex = 0;
		so->bound  = 0;
		release_sock(sk);

		sk->sk_err = ENODEV;
		if (!sock_flag(sk, SOCK_DEAD))
			sk_error_report(sk);
		break;

	case NETDEV_DOWN:
		sk->sk_err = ENETDOWN;
		if (!sock_flag(sk, SOCK_DEAD))
			sk_error_report(sk);
		break;
	}
}

/*
 * isotp_notifier - netdevice notifier callback
 *
 * Forwards NETDEV_UNREGISTER and NETDEV_DOWN events of CAN devices to every
 * socket on isotp_notifier_list. Always returns NOTIFY_DONE.
 */
static int isotp_notifier(struct notifier_block *nb, unsigned long msg,
			  void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	if (dev->type != ARPHRD_CAN)
		return NOTIFY_DONE;
	if (msg != NETDEV_UNREGISTER && msg != NETDEV_DOWN)
		return NOTIFY_DONE;
	if (unlikely(isotp_busy_notifier)) /* Check for reentrant bug.
*/
		return NOTIFY_DONE;

	/*
	 * isotp_busy_notifier doubles as the list cursor. The lock is dropped
	 * around isotp_notify(); isotp_release() waits while the cursor points
	 * at its socket before unlinking it from the list.
	 */
	spin_lock(&isotp_notifier_lock);
	list_for_each_entry(isotp_busy_notifier, &isotp_notifier_list, notifier) {
		spin_unlock(&isotp_notifier_lock);
		isotp_notify(isotp_busy_notifier, msg, dev);
		spin_lock(&isotp_notifier_lock);
	}
	isotp_busy_notifier = NULL;
	spin_unlock(&isotp_notifier_lock);
	return NOTIFY_DONE;
}

/*
 * isotp_init - initialise a freshly created isotp socket
 * @sk: socket to initialise
 *
 * Sets the default options and link layer settings, puts both state
 * machines into ISOTP_IDLE, points the rx/tx buffers at the built-in
 * arrays, sets up the three hrtimers and the wait queue, and adds the
 * socket to the notifier list.
 *
 * Return: always 0.
 */
static int isotp_init(struct sock *sk)
{
	struct isotp_sock *so = isotp_sk(sk);

	so->ifindex = 0;
	so->bound = 0;

	so->opt.flags = CAN_ISOTP_DEFAULT_FLAGS;
	so->opt.ext_address = CAN_ISOTP_DEFAULT_EXT_ADDRESS;
	so->opt.rx_ext_address = CAN_ISOTP_DEFAULT_EXT_ADDRESS;
	so->opt.rxpad_content = CAN_ISOTP_DEFAULT_PAD_CONTENT;
	so->opt.txpad_content = CAN_ISOTP_DEFAULT_PAD_CONTENT;
	so->opt.frame_txtime = CAN_ISOTP_DEFAULT_FRAME_TXTIME;
	so->frame_txtime = CAN_ISOTP_DEFAULT_FRAME_TXTIME;
	so->rxfc.bs = CAN_ISOTP_DEFAULT_RECV_BS;
	so->rxfc.stmin = CAN_ISOTP_DEFAULT_RECV_STMIN;
	so->rxfc.wftmax = CAN_ISOTP_DEFAULT_RECV_WFTMAX;
	so->ll.mtu = CAN_ISOTP_DEFAULT_LL_MTU;
	so->ll.tx_dl = CAN_ISOTP_DEFAULT_LL_TX_DL;
	so->ll.tx_flags = CAN_ISOTP_DEFAULT_LL_TX_FLAGS;

	/* set ll_dl for tx path to similar place as for rx */
	so->tx.ll_dl = so->ll.tx_dl;

	so->rx.state = ISOTP_IDLE;
	so->tx.state = ISOTP_IDLE;

	/* start with the built-in buffers */
	so->rx.buf = so->rx.sbuf;
	so->tx.buf = so->tx.sbuf;
	so->rx.buflen = ARRAY_SIZE(so->rx.sbuf);
	so->tx.buflen = ARRAY_SIZE(so->tx.sbuf);

	hrtimer_init(&so->rxtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
	so->rxtimer.function = isotp_rx_timer_handler;
	hrtimer_init(&so->txtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
	so->txtimer.function = isotp_tx_timer_handler;
	hrtimer_init(&so->txfrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
	so->txfrtimer.function = isotp_txfr_timer_handler;

	init_waitqueue_head(&so->wait);
	spin_lock_init(&so->rx_lock);

	spin_lock(&isotp_notifier_lock);
	list_add_tail(&so->notifier, &isotp_notifier_list);
	spin_unlock(&isotp_notifier_lock);

	return 0;
}

/*
 * isotp_poll - poll callback
 *
 * Starts from the datagram_poll() result, additionally waits on so->wait
 * and masks out writability while the tx state is not ISOTP_IDLE.
 */
static __poll_t isotp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	struct sock *sk =
sock->sk; struct isotp_sock *so = isotp_sk(sk); __poll_t mask = datagram_poll(file, sock, wait); poll_wait(file, &so->wait, wait); /* Check for false positives due to TX state */ if ((mask & EPOLLWRNORM) && (so->tx.state != ISOTP_IDLE)) mask &= ~(EPOLLOUT | EPOLLWRNORM); return mask; } static int isotp_sock_no_ioctlcmd(struct socket *sock, unsigned int cmd, unsigned long arg) { /* no ioctls for socket layer -> hand it down to NIC layer */ return -ENOIOCTLCMD; } static const struct proto_ops isotp_ops = { .family = PF_CAN, .release = isotp_release, .bind = isotp_bind, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = isotp_getname, .poll = isotp_poll, .ioctl = isotp_sock_no_ioctlcmd, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = isotp_setsockopt, .getsockopt = isotp_getsockopt, .sendmsg = isotp_sendmsg, .recvmsg = isotp_recvmsg, .mmap = sock_no_mmap, }; static struct proto isotp_proto __read_mostly = { .name = "CAN_ISOTP", .owner = THIS_MODULE, .obj_size = sizeof(struct isotp_sock), .init = isotp_init, }; static const struct can_proto isotp_can_proto = { .type = SOCK_DGRAM, .protocol = CAN_ISOTP, .ops = &isotp_ops, .prot = &isotp_proto, }; static struct notifier_block canisotp_notifier = { .notifier_call = isotp_notifier }; static __init int isotp_module_init(void) { int err; max_pdu_size = max_t(unsigned int, max_pdu_size, MAX_12BIT_PDU_SIZE); max_pdu_size = min_t(unsigned int, max_pdu_size, MAX_PDU_SIZE); pr_info("can: isotp protocol (max_pdu_size %d)\n", max_pdu_size); err = can_proto_register(&isotp_can_proto); if (err < 0) pr_err("can: registration of isotp protocol failed %pe\n", ERR_PTR(err)); else register_netdevice_notifier(&canisotp_notifier); return err; } static __exit void isotp_module_exit(void) { can_proto_unregister(&isotp_can_proto); unregister_netdevice_notifier(&canisotp_notifier); } module_init(isotp_module_init); module_exit(isotp_module_exit);
791 803 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _SCSI_DISK_H #define _SCSI_DISK_H /* * More than enough for everybody ;) The huge number of majors * is a leftover from 16bit dev_t days, we don't really need that * much numberspace. */ #define SD_MAJORS 16 /* * Time out in seconds for disks and Magneto-opticals (which are slower). 
*/
#define SD_TIMEOUT		(30 * HZ)
#define SD_MOD_TIMEOUT		(75 * HZ)
/*
 * Flush timeout is a multiplier over the standard device timeout which is
 * user modifiable via sysfs but initially set to SD_TIMEOUT
 */
#define SD_FLUSH_TIMEOUT_MULTIPLIER	2
#define SD_WRITE_SAME_TIMEOUT	(120 * HZ)

/*
 * Number of allowed retries
 */
#define SD_MAX_RETRIES		5
#define SD_PASSTHROUGH_RETRIES	1
#define SD_MAX_MEDIUM_TIMEOUTS	2

/*
 * Size of the initial data buffer for mode and read capacity data
 */
#define SD_BUF_SIZE		512

/*
 * Number of sectors at the end of the device to avoid multi-sector
 * accesses to in the case of last_sector_bug
 */
#define SD_LAST_BUGGY_SECTORS	8

/* Sizes related to extended CDBs */
enum {
	SD_EXT_CDB_SIZE = 32,	/* Extended CDB size */
	SD_MEMPOOL_SIZE = 2,	/* CDB pool size */
};

/*
 * Block count limits. Judging by the names: default and maximum transfer
 * length, and the maxima for WRITE SAME(10) and WRITE SAME(16).
 */
enum {
	SD_DEF_XFER_BLOCKS = 0xffff,
	SD_MAX_XFER_BLOCKS = 0xffffffff,
	SD_MAX_WS10_BLOCKS = 0xffff,
	SD_MAX_WS16_BLOCKS = 0x7fffff,
};

/* Logical block provisioning modes */
enum {
	SD_LBP_FULL = 0,	/* Full logical block provisioning */
	SD_LBP_UNMAP,		/* Use UNMAP command */
	SD_LBP_WS16,		/* Use WRITE SAME(16) with UNMAP bit */
	SD_LBP_WS10,		/* Use WRITE SAME(10) with UNMAP bit */
	SD_LBP_ZERO,		/* Use WRITE SAME(10) with zero payload */
	SD_LBP_DISABLE,		/* Discard disabled due to failed cmd */
};

/* Methods for writing zeroes */
enum {
	SD_ZERO_WRITE = 0,	/* Use WRITE(10/16) command */
	SD_ZERO_WS,		/* Use WRITE SAME(10/16) command */
	SD_ZERO_WS16_UNMAP,	/* Use WRITE SAME(16) with UNMAP */
	SD_ZERO_WS10_UNMAP,	/* Use WRITE SAME(10) with UNMAP */
};

/**
 * struct zoned_disk_info - Specific properties of a ZBC SCSI device.
 * @nr_zones: number of zones.
 * @zone_blocks: number of logical blocks per zone.
 *
 * This data structure holds the ZBC SCSI device properties that are retrieved
 * twice: a first time before the gendisk capacity is known and a second time
 * after the gendisk capacity is known.
 */
struct zoned_disk_info {
	u32		nr_zones;
	u32		zone_blocks;
};

struct scsi_disk {
	struct scsi_device *device;

	/*
	 * disk_dev is used to show attributes in /sys/class/scsi_disk/,
	 * but otherwise not really needed.
Do not use for refcounting.
	 */
	struct device	disk_dev;
	struct gendisk	*disk;
	struct opal_dev *opal_dev;
#ifdef CONFIG_BLK_DEV_ZONED
	/* Updated during revalidation before the gendisk capacity is known. */
	struct zoned_disk_info	early_zone_info;
	/* Updated during revalidation after the gendisk capacity is known. */
	struct zoned_disk_info	zone_info;
	u32		zones_optimal_open;
	u32		zones_optimal_nonseq;
	u32		zones_max_open;
	/*
	 * Either zero or a power of two. If not zero it means that the offset
	 * between zone starting LBAs is constant.
	 */
	u32		zone_starting_lba_gran;
	u32		*zones_wp_offset;
	spinlock_t	zones_wp_offset_lock;
	u32		*rev_wp_offset;
	struct mutex	rev_mutex;
	struct work_struct zone_wp_offset_work;
	char		*zone_wp_update_buf;
#endif
	atomic_t	openers;
	sector_t	capacity;	/* size in logical blocks */
	int		max_retries;
	u32		min_xfer_blocks;
	u32		max_xfer_blocks;
	u32		opt_xfer_blocks;
	u32		max_ws_blocks;
	u32		max_unmap_blocks;
	u32		unmap_granularity;
	u32		unmap_alignment;
	u32		index;
	unsigned int	physical_block_size;
	unsigned int	max_medium_access_timeouts;
	unsigned int	medium_access_timed_out;
	u8		media_present;
	u8		write_prot;
	u8		protection_type;/* Data Integrity Field */
	u8		provisioning_mode;
	u8		zeroing_mode;
	u8		nr_actuators;		/* Number of actuators */
	bool		suspended;	/* Disk is suspended (stopped) */
	unsigned	ATO : 1;	/* state of disk ATO bit */
	unsigned	cache_override : 1; /* temp override of WCE,RCD */
	unsigned	WCE : 1;	/* state of disk WCE bit */
	unsigned	RCD : 1;	/* state of disk RCD bit, unused */
	unsigned	DPOFUA : 1;	/* state of disk DPOFUA bit */
	/* checked by sd_first_printk() to decide whether to print */
	unsigned	first_scan : 1;
	unsigned	lbpme : 1;
	unsigned	lbprz : 1;
	unsigned	lbpu : 1;
	unsigned	lbpws : 1;
	unsigned	lbpws10 : 1;
	unsigned	lbpvpd : 1;
	unsigned	ws10 : 1;
	unsigned	ws16 : 1;
	unsigned	rc_basis: 2;
	/* sd_is_zoned() treats the value 1 as "zoned" */
	unsigned	zoned: 2;
	unsigned	urswrz : 1;
	unsigned	security : 1;
	unsigned	ignore_medium_access_errors : 1;
};
/* Map a pointer to the embedded disk_dev back to its struct scsi_disk */
#define to_scsi_disk(obj) container_of(obj, struct scsi_disk, disk_dev)

/* Return the struct scsi_disk stored in the gendisk's private_data */
static inline struct scsi_disk *scsi_disk(struct
gendisk *disk)
{
	return disk->private_data;
}

/*
 * Print a message for a scsi_disk. When the gendisk is already set its
 * disk_name is passed to sdev_prefix_printk(), otherwise plain
 * sdev_printk() is used. Note that sdsk is evaluated more than once.
 */
#define sd_printk(prefix, sdsk, fmt, a...)				\
	(sdsk)->disk ?							\
	      sdev_prefix_printk(prefix, (sdsk)->device,		\
				 (sdsk)->disk->disk_name, fmt, ##a) :	\
	      sdev_printk(prefix, (sdsk)->device, fmt, ##a)

/* Like sd_printk(), but only prints while the first_scan bit is set */
#define sd_first_printk(prefix, sdsk, fmt, a...)			\
	do {								\
		if ((sdsk)->first_scan)					\
			sd_printk(prefix, sdsk, fmt, ##a);		\
	} while (0)

/*
 * Tell whether a command is one of the listed read, write, verify,
 * synchronize cache, write same or unmap opcodes.
 *
 * For VARIABLE_LENGTH_CMD the decision is made on cmnd[9]. Any opcode not
 * listed, including an unlisted cmnd[9] value, yields 0.
 *
 * Return: 1 for a listed command, 0 otherwise.
 */
static inline int scsi_medium_access_command(struct scsi_cmnd *scmd)
{
	switch (scmd->cmnd[0]) {
	case READ_6:
	case READ_10:
	case READ_12:
	case READ_16:
	case SYNCHRONIZE_CACHE:
	case VERIFY:
	case VERIFY_12:
	case VERIFY_16:
	case WRITE_6:
	case WRITE_10:
	case WRITE_12:
	case WRITE_16:
	case WRITE_SAME:
	case WRITE_SAME_16:
	case UNMAP:
		return 1;
	case VARIABLE_LENGTH_CMD:
		switch (scmd->cmnd[9]) {
		case READ_32:
		case VERIFY_32:
		case WRITE_32:
		case WRITE_SAME_32:
			return 1;
		}
	}

	return 0;
}

/*
 * Convert a number of logical blocks into units of 512 bytes, using the
 * device's sector_size as the logical block size.
 */
static inline sector_t logical_to_sectors(struct scsi_device *sdev, sector_t blocks)
{
	return blocks << (ilog2(sdev->sector_size) - 9);
}

/*
 * Convert a number of logical blocks into bytes.
 * NOTE(review): the result is returned as unsigned int, so a product that
 * does not fit is truncated - confirm callers only pass small counts.
 */
static inline unsigned int logical_to_bytes(struct scsi_device *sdev, sector_t blocks)
{
	return blocks * sdev->sector_size;
}

/* Convert a byte count into logical blocks, rounding down */
static inline sector_t bytes_to_logical(struct scsi_device *sdev, unsigned int bytes)
{
	return bytes >> ilog2(sdev->sector_size);
}

/* Convert a count of 512 byte units into logical blocks, rounding down */
static inline sector_t sectors_to_logical(struct scsi_device *sdev, sector_t sector)
{
	return sector >> (ilog2(sdev->sector_size) - 9);
}

#ifdef CONFIG_BLK_DEV_INTEGRITY

extern void sd_dif_config_host(struct scsi_disk *);

#else /* CONFIG_BLK_DEV_INTEGRITY */

/* Empty stub used when CONFIG_BLK_DEV_INTEGRITY is not set */
static inline void sd_dif_config_host(struct scsi_disk *disk)
{
}

#endif /* CONFIG_BLK_DEV_INTEGRITY */

/* Non-zero when the zoned field is 1 or the device type is TYPE_ZBC */
static inline int sd_is_zoned(struct scsi_disk *sdkp)
{
	return sdkp->zoned == 1 || sdkp->device->type == TYPE_ZBC;
}

#ifdef CONFIG_BLK_DEV_ZONED

void sd_zbc_free_zone_info(struct scsi_disk *sdkp);
int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE]);
int sd_zbc_revalidate_zones(struct scsi_disk *sdkp);
blk_status_t sd_zbc_setup_zone_mgmt_cmnd(struct scsi_cmnd *cmd,
unsigned char op, bool all);
unsigned int sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes,
			     struct scsi_sense_hdr *sshdr);
int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
		unsigned int nr_zones, report_zones_cb cb, void *data);

blk_status_t sd_zbc_prepare_zone_append(struct scsi_cmnd *cmd, sector_t *lba,
				        unsigned int nr_blocks);

#else /* CONFIG_BLK_DEV_ZONED */

/*
 * Stubs used when CONFIG_BLK_DEV_ZONED is not set: the free/read/revalidate
 * helpers do nothing and report success, the command setup helpers fail
 * with BLK_STS_TARGET, and completion passes good_bytes through unchanged.
 */
static inline void sd_zbc_free_zone_info(struct scsi_disk *sdkp) {}

static inline int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE])
{
	return 0;
}

static inline int sd_zbc_revalidate_zones(struct scsi_disk *sdkp)
{
	return 0;
}

static inline blk_status_t sd_zbc_setup_zone_mgmt_cmnd(struct scsi_cmnd *cmd,
						       unsigned char op,
						       bool all)
{
	return BLK_STS_TARGET;
}

static inline unsigned int sd_zbc_complete(struct scsi_cmnd *cmd,
			unsigned int good_bytes, struct scsi_sense_hdr *sshdr)
{
	return good_bytes;
}

static inline blk_status_t sd_zbc_prepare_zone_append(struct scsi_cmnd *cmd,
						      sector_t *lba,
						      unsigned int nr_blocks)
{
	return BLK_STS_TARGET;
}

/* no report_zones callback without zoned block device support */
#define sd_zbc_report_zones NULL

#endif /* CONFIG_BLK_DEV_ZONED */

void sd_print_sense_hdr(struct scsi_disk *sdkp, struct scsi_sense_hdr *sshdr);
void sd_print_result(const struct scsi_disk *sdkp, const char *msg, int result);

#endif /* _SCSI_DISK_H */
345 237 84 26 330 432 50 113 2 727 858 854 856 708 751 755 4 4 4 4 1060 5 689 365 131 209 288 89 4 4 7 4 26 35 83 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 
499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the IP module. * * Version: @(#)ip.h 1.0.2 05/07/93 * * Authors: Ross Biro * Fred N. 
van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Changes:
 *		Mike McLagan    :       Routing by source
 */
#ifndef _IP_H
#define _IP_H

#include <linux/types.h>
#include <linux/ip.h>
#include <linux/in.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/sockptr.h>
#include <linux/static_key.h>

#include <net/inet_sock.h>
#include <net/route.h>
#include <net/snmp.h>
#include <net/flow.h>
#include <net/flow_dissector.h>
#include <net/netns/hash.h>
#include <net/lwtunnel.h>

#define IPV4_MAX_PMTU		65535U		/* RFC 2675, Section 5.1 */
#define IPV4_MIN_MTU		68			/* RFC 791 */

extern unsigned int sysctl_fib_sync_mem;
extern unsigned int sysctl_fib_sync_mem_min;
extern unsigned int sysctl_fib_sync_mem_max;

struct sock;

/*
 * Per-packet IPv4 state. The IPCB() macro casts the skb control buffer
 * (skb->cb) to this structure.
 */
struct inet_skb_parm {
	int			iif;
	struct ip_options	opt;		/* Compiled IP options		*/
	u16			flags;

/* bit values for the flags field above */
#define IPSKB_FORWARDED		BIT(0)
#define IPSKB_XFRM_TUNNEL_SIZE	BIT(1)
#define IPSKB_XFRM_TRANSFORMED	BIT(2)
#define IPSKB_FRAG_COMPLETE	BIT(3)
#define IPSKB_REROUTED		BIT(4)
#define IPSKB_DOREDIRECT	BIT(5)
#define IPSKB_FRAG_PMTU		BIT(6)
#define IPSKB_L3SLAVE		BIT(7)
#define IPSKB_NOPOLICY		BIT(8)
#define IPSKB_MULTIPATH		BIT(9)

	u16			frag_max_size;
};

/* True when the IPSKB_L3SLAVE bit is set in @flags */
static inline bool ipv4_l3mdev_skb(u16 flags)
{
	return !!(flags & IPSKB_L3SLAVE);
}

/* IPv4 header length in bytes: the ihl field counts 4 byte words */
static inline unsigned int ip_hdrlen(const struct sk_buff *skb)
{
	return ip_hdr(skb)->ihl * 4;
}

/* Parameter bundle for sending; initialised by ipcm_init()/ipcm_init_sk() */
struct ipcm_cookie {
	struct sockcm_cookie	sockc;
	__be32			addr;
	int			oif;
	struct ip_options_rcu	*opt;
	__u8			protocol;
	__u8			ttl;
	__s16			tos;
	char			priority;
	__u16			gso_size;
};

/* Zero the whole cookie except tos, which is set to -1 */
static inline void ipcm_init(struct ipcm_cookie *ipcm)
{
	*ipcm = (struct ipcm_cookie) { .tos = -1 };
}

/*
 * Initialise the cookie as ipcm_init() does, then take mark, tsflags,
 * bound device, source address and protocol number from the socket.
 */
static inline void ipcm_init_sk(struct ipcm_cookie *ipcm,
				const struct inet_sock *inet)
{
	ipcm_init(ipcm);

	ipcm->sockc.mark = READ_ONCE(inet->sk.sk_mark);
	ipcm->sockc.tsflags = READ_ONCE(inet->sk.sk_tsflags);
	ipcm->oif = READ_ONCE(inet->sk.sk_bound_dev_if);
	ipcm->addr = inet->inet_saddr;
	ipcm->protocol = inet->inet_num;
}

#define IPCB(skb) ((struct
inet_skb_parm*)((skb)->cb)) #define PKTINFO_SKB_CB(skb) ((struct in_pktinfo *)((skb)->cb)) /* return enslaved device index if relevant */ static inline int inet_sdif(const struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) if (skb && ipv4_l3mdev_skb(IPCB(skb)->flags)) return IPCB(skb)->iif; #endif return 0; } /* Special input handler for packets caught by router alert option. They are selected only by protocol field, and then processed likely local ones; but only if someone wants them! Otherwise, router not running rsvpd will kill RSVP. It is user level problem, what it will make with them. I have no idea, how it will masquearde or NAT them (it is joke, joke :-)), but receiver should be enough clever f.e. to forward mtrace requests, sent to multicast group to reach destination designated router. */ struct ip_ra_chain { struct ip_ra_chain __rcu *next; struct sock *sk; union { void (*destructor)(struct sock *); struct sock *saved_sk; }; struct rcu_head rcu; }; /* IP flags. */ #define IP_CE 0x8000 /* Flag: "Congestion" */ #define IP_DF 0x4000 /* Flag: "Don't Fragment" */ #define IP_MF 0x2000 /* Flag: "More Fragments" */ #define IP_OFFSET 0x1FFF /* "Fragment Offset" part */ #define IP_FRAG_TIME (30 * HZ) /* fragment lifetime */ struct msghdr; struct net_device; struct packet_type; struct rtable; struct sockaddr; int igmp_mc_init(void); /* * Functions provided by ip.c */ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk, __be32 saddr, __be32 daddr, struct ip_options_rcu *opt, u8 tos); int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); void ip_list_rcv(struct list_head *head, struct packet_type *pt, struct net_device *orig_dev); int ip_local_deliver(struct sk_buff *skb); void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int proto); int ip_mr_input(struct sk_buff *skb); int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb); int ip_mc_output(struct net 
*net, struct sock *sk, struct sk_buff *skb); int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)); struct ip_fraglist_iter { struct sk_buff *frag; struct iphdr *iph; int offset; unsigned int hlen; }; void ip_fraglist_init(struct sk_buff *skb, struct iphdr *iph, unsigned int hlen, struct ip_fraglist_iter *iter); void ip_fraglist_prepare(struct sk_buff *skb, struct ip_fraglist_iter *iter); static inline struct sk_buff *ip_fraglist_next(struct ip_fraglist_iter *iter) { struct sk_buff *skb = iter->frag; iter->frag = skb->next; skb_mark_not_on_list(skb); return skb; } struct ip_frag_state { bool DF; unsigned int hlen; unsigned int ll_rs; unsigned int mtu; unsigned int left; int offset; int ptr; __be16 not_last_frag; }; void ip_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int ll_rs, unsigned int mtu, bool DF, struct ip_frag_state *state); struct sk_buff *ip_frag_next(struct sk_buff *skb, struct ip_frag_state *state); void ip_send_check(struct iphdr *ip); int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb); int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb); int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, __u8 tos); void ip_init(void); int ip_append_data(struct sock *sk, struct flowi4 *fl4, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int len, int protolen, struct ipcm_cookie *ipc, struct rtable **rt, unsigned int flags); int ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb); struct sk_buff *__ip_make_skb(struct sock *sk, struct flowi4 *fl4, struct sk_buff_head *queue, struct inet_cork *cork); int ip_send_skb(struct net *net, struct sk_buff *skb); int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4); void ip_flush_pending_frames(struct sock *sk); struct sk_buff *ip_make_skb(struct sock *sk, struct 
flowi4 *fl4, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, struct ipcm_cookie *ipc, struct rtable **rtp, struct inet_cork *cork, unsigned int flags); int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl); static inline struct sk_buff *ip_finish_skb(struct sock *sk, struct flowi4 *fl4) { return __ip_make_skb(sk, fl4, &sk->sk_write_queue, &inet_sk(sk)->cork.base); } /* Get the route scope that should be used when sending a packet. */ static inline u8 ip_sendmsg_scope(const struct inet_sock *inet, const struct ipcm_cookie *ipc, const struct msghdr *msg) { if (sock_flag(&inet->sk, SOCK_LOCALROUTE) || msg->msg_flags & MSG_DONTROUTE || (ipc->opt && ipc->opt->opt.is_strictroute)) return RT_SCOPE_LINK; return RT_SCOPE_UNIVERSE; } static inline __u8 get_rttos(struct ipcm_cookie* ipc, struct inet_sock *inet) { return (ipc->tos != -1) ? RT_TOS(ipc->tos) : RT_TOS(READ_ONCE(inet->tos)); } /* datagram.c */ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); void ip4_datagram_release_cb(struct sock *sk); struct ip_reply_arg { struct kvec iov[1]; int flags; __wsum csum; int csumoffset; /* u16 offset of csum in iov[0].iov_base */ /* -1 if not needed */ int bound_dev_if; u8 tos; kuid_t uid; }; #define IP_REPLY_ARG_NOSRCCHECK 1 static inline __u8 ip_reply_arg_flowi_flags(const struct ip_reply_arg *arg) { return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? 
FLOWI_FLAG_ANYSRC : 0; } void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, const struct ip_options *sopt, __be32 daddr, __be32 saddr, const struct ip_reply_arg *arg, unsigned int len, u64 transmit_time, u32 txhash); #define IP_INC_STATS(net, field) SNMP_INC_STATS64((net)->mib.ip_statistics, field) #define __IP_INC_STATS(net, field) __SNMP_INC_STATS64((net)->mib.ip_statistics, field) #define IP_ADD_STATS(net, field, val) SNMP_ADD_STATS64((net)->mib.ip_statistics, field, val) #define __IP_ADD_STATS(net, field, val) __SNMP_ADD_STATS64((net)->mib.ip_statistics, field, val) #define IP_UPD_PO_STATS(net, field, val) SNMP_UPD_PO_STATS64((net)->mib.ip_statistics, field, val) #define __IP_UPD_PO_STATS(net, field, val) __SNMP_UPD_PO_STATS64((net)->mib.ip_statistics, field, val) #define NET_INC_STATS(net, field) SNMP_INC_STATS((net)->mib.net_statistics, field) #define __NET_INC_STATS(net, field) __SNMP_INC_STATS((net)->mib.net_statistics, field) #define NET_ADD_STATS(net, field, adnd) SNMP_ADD_STATS((net)->mib.net_statistics, field, adnd) #define __NET_ADD_STATS(net, field, adnd) __SNMP_ADD_STATS((net)->mib.net_statistics, field, adnd) static inline u64 snmp_get_cpu_field(void __percpu *mib, int cpu, int offt) { return *(((unsigned long *)per_cpu_ptr(mib, cpu)) + offt); } unsigned long snmp_fold_field(void __percpu *mib, int offt); #if BITS_PER_LONG==32 u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offct, size_t syncp_offset); u64 snmp_fold_field64(void __percpu *mib, int offt, size_t sync_off); #else static inline u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offct, size_t syncp_offset) { return snmp_get_cpu_field(mib, cpu, offct); } static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_off) { return snmp_fold_field(mib, offt); } #endif #define snmp_get_cpu_field64_batch(buff64, stats_list, mib_statistic, offset) \ { \ int i, c; \ for_each_possible_cpu(c) { \ for (i = 0; stats_list[i].name; i++) \ buff64[i] += 
snmp_get_cpu_field64( \ mib_statistic, \ c, stats_list[i].entry, \ offset); \ } \ } #define snmp_get_cpu_field_batch(buff, stats_list, mib_statistic) \ { \ int i, c; \ for_each_possible_cpu(c) { \ for (i = 0; stats_list[i].name; i++) \ buff[i] += snmp_get_cpu_field( \ mib_statistic, \ c, stats_list[i].entry); \ } \ } static inline void inet_get_local_port_range(const struct net *net, int *low, int *high) { u32 range = READ_ONCE(net->ipv4.ip_local_ports.range); *low = range & 0xffff; *high = range >> 16; } bool inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high); #ifdef CONFIG_SYSCTL static inline bool inet_is_local_reserved_port(struct net *net, unsigned short port) { if (!net->ipv4.sysctl_local_reserved_ports) return false; return test_bit(port, net->ipv4.sysctl_local_reserved_ports); } static inline bool sysctl_dev_name_is_allowed(const char *name) { return strcmp(name, "default") != 0 && strcmp(name, "all") != 0; } static inline bool inet_port_requires_bind_service(struct net *net, unsigned short port) { return port < READ_ONCE(net->ipv4.sysctl_ip_prot_sock); } #else static inline bool inet_is_local_reserved_port(struct net *net, unsigned short port) { return false; } static inline bool inet_port_requires_bind_service(struct net *net, unsigned short port) { return port < PROT_SOCK; } #endif __be32 inet_current_timestamp(void); /* From inetpeer.c */ extern int inet_peer_threshold; extern int inet_peer_minttl; extern int inet_peer_maxttl; void ipfrag_init(void); void ip_static_sysctl_init(void); #define IP4_REPLY_MARK(net, mark) \ (READ_ONCE((net)->ipv4.sysctl_fwmark_reflect) ? (mark) : 0) static inline bool ip_is_fragment(const struct iphdr *iph) { return (iph->frag_off & htons(IP_MF | IP_OFFSET)) != 0; } #ifdef CONFIG_INET #include <net/dst.h> /* The function in 2.2 was invalid, producing wrong result for * check=0xFEFF. It was noticed by Arthur Skawina _year_ ago. 
--ANK(000625) */
/*
 * Decrement the TTL and patch the header checksum in place: adds
 * htons(0x0100) to the checksum and folds the carry back in.
 * Returns the decremented TTL.
 */
static inline int ip_decrease_ttl(struct iphdr *iph)
{
	u32 check = (__force u32)iph->check;
	check += (__force u32)htons(0x0100);
	iph->check = (__force __sum16)(check + (check>=0xFFFF));
	return --iph->ttl;
}

/* Non-zero when the route has rt_mtu_locked set or its RTAX_MTU metric is locked. */
static inline int ip_mtu_locked(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *)dst;

	return rt->rt_mtu_locked || dst_metric_locked(dst, RTAX_MTU);
}

/*
 * True for IP_PMTUDISC_DO, or for IP_PMTUDISC_WANT when the route's
 * MTU is not locked.
 */
static inline
int ip_dont_fragment(const struct sock *sk, const struct dst_entry *dst)
{
	u8 pmtudisc = READ_ONCE(inet_sk(sk)->pmtudisc);

	return pmtudisc == IP_PMTUDISC_DO ||
		(pmtudisc == IP_PMTUDISC_WANT &&
		 !ip_mtu_locked(dst));
}

/* False only for the IP_PMTUDISC_INTERFACE and IP_PMTUDISC_OMIT modes. */
static inline bool ip_sk_accept_pmtu(const struct sock *sk)
{
	u8 pmtudisc = READ_ONCE(inet_sk(sk)->pmtudisc);

	return pmtudisc != IP_PMTUDISC_INTERFACE &&
	       pmtudisc != IP_PMTUDISC_OMIT;
}

/* True when the socket's pmtudisc value is below IP_PMTUDISC_PROBE. */
static inline bool ip_sk_use_pmtu(const struct sock *sk)
{
	return READ_ONCE(inet_sk(sk)->pmtudisc) < IP_PMTUDISC_PROBE;
}

/* True when pmtudisc is below IP_PMTUDISC_DO or equals IP_PMTUDISC_OMIT. */
static inline bool ip_sk_ignore_df(const struct sock *sk)
{
	u8 pmtudisc = READ_ONCE(inet_sk(sk)->pmtudisc);

	return pmtudisc < IP_PMTUDISC_DO || pmtudisc == IP_PMTUDISC_OMIT;
}

/*
 * Pick the MTU for @dst, trying in order:
 *   1. the route's rt_pmtu, if still unexpired - only consulted when
 *      not forwarding, when the MTU is locked, or when the
 *      sysctl_ip_fwd_use_pmtu sysctl is set;
 *   2. the raw RTAX_MTU metric;
 *   3. the device MTU, capped at 576 for locked-MTU routes via a gateway.
 * The result is clamped to IP_MAX_MTU and reduced by the lwtunnel headroom.
 */
static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst,
						    bool forwarding)
{
	const struct rtable *rt = container_of(dst, struct rtable, dst);
	struct net *net = dev_net(dst->dev);
	unsigned int mtu;

	if (READ_ONCE(net->ipv4.sysctl_ip_fwd_use_pmtu) ||
	    ip_mtu_locked(dst) ||
	    !forwarding) {
		mtu = rt->rt_pmtu;
		if (mtu && time_before(jiffies, rt->dst.expires))
			goto out;
	}

	/* 'forwarding = true' case should always honour route mtu */
	mtu = dst_metric_raw(dst, RTAX_MTU);
	if (mtu)
		goto out;

	mtu = READ_ONCE(dst->dev->mtu);

	if (unlikely(ip_mtu_locked(dst))) {
		if (rt->rt_uses_gateway && mtu > 576)
			mtu = 576;
	}

out:
	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

/*
 * MTU for @skb's route. Without a full socket, or when the socket uses
 * PMTU, defer to ip_dst_mtu_maybe_forward(); otherwise use the device
 * MTU capped at IP_MAX_MTU, less the lwtunnel headroom.
 */
static inline unsigned int ip_skb_dst_mtu(struct sock *sk,
					  const struct sk_buff *skb)
{
	unsigned int mtu;

	if (!sk || !sk_fullsock(sk) || ip_sk_use_pmtu(sk)) {
		bool forwarding = IPCB(skb)->flags & IPSKB_FORWARDED;

		return ip_dst_mtu_maybe_forward(skb_dst(skb), forwarding);
	}

	mtu = min(READ_ONCE(skb_dst(skb)->dev->mtu), IP_MAX_MTU);
	return mtu - lwtunnel_headroom(skb_dst(skb)->lwtstate, mtu);
}

struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx,
					int fc_mx_len,
					struct netlink_ext_ack *extack);

/* Drop one reference; free unless this is the shared default metrics. */
static inline void ip_fib_metrics_put(struct dst_metrics *fib_metrics)
{
	if (fib_metrics != &dst_default_metrics &&
	    refcount_dec_and_test(&fib_metrics->refcnt))
		kfree(fib_metrics);
}

/* ipv4 and ipv6 both use refcounted metrics if it is not the default */
static inline
void ip_dst_init_metrics(struct dst_entry *dst, struct dst_metrics *fib_metrics)
{
	dst_init_metrics(dst, fib_metrics->metrics, true);

	if (fib_metrics != &dst_default_metrics) {
		dst->_metrics |= DST_METRICS_REFCOUNTED;
		refcount_inc(&fib_metrics->refcnt);
	}
}

/* Release the metrics attached to @dst, with the same default-metrics exemption. */
static inline
void ip_dst_metrics_put(struct dst_entry *dst)
{
	struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);

	if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
		kfree(p);
}

void __ip_select_ident(struct net *net, struct iphdr *iph, int segs);

/*
 * Fill in iph->id for @skb, reserving @segs consecutive IDs.
 * Sockets with a destination address use their own inet_id counter;
 * otherwise the ID is 0 for DF packets that do not ignore DF, and
 * comes from __ip_select_ident() for everything else.
 */
static inline void ip_select_ident_segs(struct net *net, struct sk_buff *skb,
					struct sock *sk, int segs)
{
	struct iphdr *iph = ip_hdr(skb);

	/* We had many attacks based on IPID, use the private
	 * generator as much as we can.
	 */
	if (sk && inet_sk(sk)->inet_daddr) {
		int val;

		/* avoid atomic operations for TCP,
		 * as we hold socket lock at this point.
		 */
		if (sk_is_tcp(sk)) {
			sock_owned_by_me(sk);
			val = atomic_read(&inet_sk(sk)->inet_id);
			atomic_set(&inet_sk(sk)->inet_id, val + segs);
		} else {
			val = atomic_add_return(segs, &inet_sk(sk)->inet_id);
		}
		iph->id = htons(val);
		return;
	}
	if ((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) {
		iph->id = 0;
	} else {
		/* Unfortunately we need the big hammer to get a suitable IPID */
		__ip_select_ident(net, iph, segs);
	}
}

/* Single-segment form of ip_select_ident_segs(). */
static inline void ip_select_ident(struct net *net, struct sk_buff *skb,
				   struct sock *sk)
{
	ip_select_ident_segs(net, skb, sk, 1);
}

/* Unfolded pseudo-header checksum over saddr, daddr, skb->len and @proto. */
static inline __wsum inet_compute_pseudo(struct sk_buff *skb, int proto)
{
	return csum_tcpudp_nofold(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
				  skb->len, proto, 0);
}

/* copy IPv4 saddr & daddr to flow_keys, possibly using 64bit load/store
 * Equivalent to :	flow->v4addrs.src = iph->saddr;
 *			flow->v4addrs.dst = iph->daddr;
 */
static inline void iph_to_flow_copy_v4addrs(struct flow_keys *flow,
					    const struct iphdr *iph)
{
	/* The single memcpy relies on dst sitting directly after src. */
	BUILD_BUG_ON(offsetof(typeof(flow->addrs), v4addrs.dst) !=
		     offsetof(typeof(flow->addrs), v4addrs.src) +
			      sizeof(flow->addrs.v4addrs.src));
	memcpy(&flow->addrs.v4addrs, &iph->addrs, sizeof(flow->addrs.v4addrs));
	flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
}

/*
 *	Map a multicast IP onto multicast MAC for type ethernet.
 */

/* Writes 6 bytes: 01:00:5e followed by the low 23 bits of the address. */
static inline void ip_eth_mc_map(__be32 naddr, char *buf)
{
	__u32 addr=ntohl(naddr);
	buf[0]=0x01;
	buf[1]=0x00;
	buf[2]=0x5e;
	buf[5]=addr&0xFF;
	addr>>=8;
	buf[4]=addr&0xFF;
	addr>>=8;
	buf[3]=addr&0x7F;
}

/*
 *	Map a multicast IP onto multicast MAC for type IP-over-InfiniBand.
 *	Leave P_Key as 0 to be filled in by driver.
*/

/*
 * Writes bytes 0..19 of @buf. The scope nibble and bytes 8-9 are copied
 * from @broadcast; the low 28 bits of the address land in bytes 16..19.
 */
static inline void ip_ib_mc_map(__be32 naddr, const unsigned char *broadcast, char *buf)
{
	__u32 addr;
	unsigned char scope = broadcast[5] & 0xF;

	buf[0] = 0; /* Reserved */
	buf[1] = 0xff; /* Multicast QPN */
	buf[2] = 0xff;
	buf[3] = 0xff;
	addr = ntohl(naddr);
	buf[4] = 0xff;
	buf[5] = 0x10 | scope; /* scope from broadcast address */
	buf[6] = 0x40; /* IPv4 signature */
	buf[7] = 0x1b;
	buf[8] = broadcast[8]; /* P_Key */
	buf[9] = broadcast[9];
	buf[10] = 0;
	buf[11] = 0;
	buf[12] = 0;
	buf[13] = 0;
	buf[14] = 0;
	buf[15] = 0;
	buf[19] = addr & 0xff;
	addr >>= 8;
	buf[18] = addr & 0xff;
	addr >>= 8;
	buf[17] = addr & 0xff;
	addr >>= 8;
	buf[16] = addr & 0x0f;
}

/*
 * Copy 4 bytes into @buf: the first four bytes of @broadcast when any of
 * them is non-zero, otherwise the multicast address itself.
 */
static inline void ip_ipgre_mc_map(__be32 naddr, const unsigned char *broadcast, char *buf)
{
	if ((broadcast[0] | broadcast[1] | broadcast[2] | broadcast[3]) != 0)
		memcpy(buf, broadcast, 4);
	else
		memcpy(buf, &naddr, sizeof(naddr));
}

#if IS_ENABLED(CONFIG_IPV6)
#include <linux/ipv6.h>
#endif

/*
 * Zero the socket's IPv4 source and receive-source addresses; for
 * PF_INET6 sockets (when IPv6 is enabled) also zero the IPv6 equivalents.
 */
static __inline__ void inet_reset_saddr(struct sock *sk)
{
	inet_sk(sk)->inet_rcv_saddr = inet_sk(sk)->inet_saddr = 0;
#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == PF_INET6) {
		struct ipv6_pinfo *np = inet6_sk(sk);

		memset(&np->saddr, 0, sizeof(np->saddr));
		memset(&sk->sk_v6_rcv_saddr, 0, sizeof(sk->sk_v6_rcv_saddr));
	}
#endif
}

#endif

/* The hash of an IPv4 address is the address value itself. */
static inline unsigned int ipv4_addr_hash(__be32 ip)
{
	return (__force unsigned int) ip;
}

/* jhash of @saddr seeded with the per-net mix, XORed with @port. */
static inline u32 ipv4_portaddr_hash(const struct net *net,
				     __be32 saddr,
				     unsigned int port)
{
	return jhash_1word((__force u32)saddr, net_hash_mix(net)) ^ port;
}

bool ip_call_ra_chain(struct sk_buff *skb);

/*
 *	Functions provided by ip_fragment.c
 */

/*
 * Each CONNTRACK entry reserves a range of USHRT_MAX + 1 values, ending
 * at the double-underscore enumerator that follows it.
 * NOTE(review): the third range end is spelled
 * __IP_DEFRAG_CONNTRACK_BRIDGE_IN, without the _END suffix its two
 * siblings carry - looks like a naming slip, left as is for users.
 */
enum ip_defrag_users {
	IP_DEFRAG_LOCAL_DELIVER,
	IP_DEFRAG_CALL_RA_CHAIN,
	IP_DEFRAG_CONNTRACK_IN,
	__IP_DEFRAG_CONNTRACK_IN_END = IP_DEFRAG_CONNTRACK_IN + USHRT_MAX,
	IP_DEFRAG_CONNTRACK_OUT,
	__IP_DEFRAG_CONNTRACK_OUT_END = IP_DEFRAG_CONNTRACK_OUT + USHRT_MAX,
	IP_DEFRAG_CONNTRACK_BRIDGE_IN,
	__IP_DEFRAG_CONNTRACK_BRIDGE_IN = IP_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX,
	IP_DEFRAG_VS_IN,
	IP_DEFRAG_VS_OUT,
	IP_DEFRAG_VS_FWD,
	IP_DEFRAG_AF_PACKET,
	IP_DEFRAG_MACVLAN,
};

/* Return true if the value of 'user' is between 'lower_bond'
 * and 'upper_bond' inclusively.
 */
static inline bool ip_defrag_user_in_between(u32 user,
					     enum ip_defrag_users lower_bond,
					     enum ip_defrag_users upper_bond)
{
	return user >= lower_bond && user <= upper_bond;
}

int ip_defrag(struct net *net, struct sk_buff *skb, u32 user);
#ifdef CONFIG_INET
struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user);
#else
/* Without CONFIG_INET the skb is handed back untouched. */
static inline struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user)
{
	return skb;
}
#endif

/*
 *	Functions provided by ip_forward.c
 */

int ip_forward(struct sk_buff *skb);

/*
 *	Functions provided by ip_options.c
 */

void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
		      __be32 daddr, struct rtable *rt);

int __ip_options_echo(struct net *net, struct ip_options *dopt,
		      struct sk_buff *skb, const struct ip_options *sopt);
/* __ip_options_echo() using the options stored in @skb's control block. */
static inline int ip_options_echo(struct net *net, struct ip_options *dopt,
				  struct sk_buff *skb)
{
	return __ip_options_echo(net, dopt, skb, &IPCB(skb)->opt);
}

void ip_options_fragment(struct sk_buff *skb);
int __ip_options_compile(struct net *net, struct ip_options *opt,
			 struct sk_buff *skb, __be32 *info);
int ip_options_compile(struct net *net, struct ip_options *opt,
		       struct sk_buff *skb);
int ip_options_get(struct net *net, struct ip_options_rcu **optp,
		   sockptr_t data, int optlen);
void ip_options_undo(struct ip_options *opt);
void ip_forward_options(struct sk_buff *skb);
int ip_options_rcv_srr(struct sk_buff *skb, struct net_device *dev);

/*
 *	Functions provided by ip_sockglue.c
 */

void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb, bool drop_dst);
void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk,
			 struct sk_buff *skb, int tlen, int offset);
int ip_cmsg_send(struct sock *sk, struct msghdr *msg,
		 struct ipcm_cookie *ipc, bool allow_ipv6);
DECLARE_STATIC_KEY_FALSE(ip4_min_ttl);
int do_ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
		     unsigned int optlen);
int ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
		  unsigned int optlen);
int do_ip_getsockopt(struct sock *sk, int level, int optname,
		     sockptr_t optval, sockptr_t optlen);
int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
		  int __user *optlen);
int ip_ra_control(struct sock *sk, unsigned char on,
		  void (*destructor)(struct sock *));

int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len);
void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port,
		   u32 info, u8 *payload);
void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 dport,
		    u32 info);

/* ip_cmsg_recv_offset() for the skb's own socket, with tlen and offset 0. */
static inline void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
{
	ip_cmsg_recv_offset(msg, skb->sk, skb, 0, 0);
}

bool icmp_global_allow(void);
extern int sysctl_icmp_msgs_per_sec;
extern int sysctl_icmp_msgs_burst;

#ifdef CONFIG_PROC_FS
int ip_misc_proc_init(void);
#endif

int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto, u8 family,
				struct netlink_ext_ack *extack);

/* True when @mtu is at least IPV4_MIN_MTU; hinted as the likely outcome. */
static inline bool inetdev_valid_mtu(unsigned int mtu)
{
	return likely(mtu >= IPV4_MIN_MTU);
}

void ip_sock_set_freebind(struct sock *sk);
int ip_sock_set_mtu_discover(struct sock *sk, int val);
void ip_sock_set_pktinfo(struct sock *sk);
void ip_sock_set_recverr(struct sock *sk);
void ip_sock_set_tos(struct sock *sk, int val);
void __ip_sock_set_tos(struct sock *sk, int val);

#endif /* _IP_H */
136 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Internal procfs definitions * * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/proc_fs.h> #include <linux/proc_ns.h> #include <linux/refcount.h> #include <linux/spinlock.h> #include <linux/atomic.h> #include <linux/binfmts.h> #include <linux/sched/coredump.h> #include <linux/sched/task.h> struct ctl_table_header; struct mempolicy; /* * This is not completely implemented yet. The idea is to * create an in-memory tree (like the actual /proc filesystem * tree) of these proc_dir_entries, so that we can dynamically * add new files to /proc. * * parent/subdir are used for the directory structure (every /proc file has a * parent, but "subdir" is empty for all non-directory entries). 
* subdir_node is used to build the rb tree "subdir" of the parent.
 */
struct proc_dir_entry {
	/*
	 * number of callers into module in progress;
	 * negative -> it's going away RSN
	 */
	atomic_t in_use;
	refcount_t refcnt;
	struct list_head pde_openers; /* who did ->open, but not ->release */
	/* protects ->pde_openers and all struct pde_opener instances */
	spinlock_t pde_unload_lock;
	struct completion *pde_unload_completion;
	const struct inode_operations *proc_iops;
	union {
		const struct proc_ops *proc_ops;
		const struct file_operations *proc_dir_ops;
	};
	const struct dentry_operations *proc_dops;
	union {
		const struct seq_operations *seq_ops;
		int (*single_show)(struct seq_file *, void *);
	};
	proc_write_t write;
	void *data;
	unsigned int state_size;
	unsigned int low_ino;
	nlink_t nlink;
	kuid_t uid;
	kgid_t gid;
	loff_t size;
	struct proc_dir_entry *parent;
	struct rb_root subdir;
	struct rb_node subdir_node;
	char *name;
	umode_t mode;
	u8 flags;
	u8 namelen;
	/* flexible array member; must stay last */
	char inline_name[];
} __randomize_layout;

/*
 * The smallest of 128, 192, 256 or 512 that is strictly larger than the
 * struct; evaluates to 0 once the struct reaches 512 bytes.
 */
#define SIZEOF_PDE ( \
	sizeof(struct proc_dir_entry) < 128 ? 128 : \
	sizeof(struct proc_dir_entry) < 192 ? 192 : \
	sizeof(struct proc_dir_entry) < 256 ? 256 : \
	sizeof(struct proc_dir_entry) < 512 ? 512 : \
	0)
/* Bytes left over in SIZEOF_PDE after the fixed part of the struct. */
#define SIZEOF_PDE_INLINE_NAME (SIZEOF_PDE - sizeof(struct proc_dir_entry))

/* True when PROC_ENTRY_PERMANENT is set in the entry's flags. */
static inline bool pde_is_permanent(const struct proc_dir_entry *pde)
{
	return pde->flags & PROC_ENTRY_PERMANENT;
}

/* Set PROC_ENTRY_PERMANENT in the entry's flags. */
static inline void pde_make_permanent(struct proc_dir_entry *pde)
{
	pde->flags |= PROC_ENTRY_PERMANENT;
}

extern struct kmem_cache *proc_dir_entry_cache;
void pde_free(struct proc_dir_entry *pde);

/* One of three alternatives sharing storage in struct proc_inode. */
union proc_op {
	int (*proc_get_link)(struct dentry *, struct path *);
	int (*proc_show)(struct seq_file *m,
		struct pid_namespace *ns, struct pid *pid,
		struct task_struct *task);
	int lsmid;
};

/*
 * procfs inode wrapper. The generic inode is embedded as @vfs_inode,
 * which is what PROC_I() uses to get back to this structure.
 */
struct proc_inode {
	struct pid *pid;
	unsigned int fd;
	union proc_op op;
	struct proc_dir_entry *pde;
	struct ctl_table_header *sysctl;
	struct ctl_table *sysctl_entry;
	struct hlist_node sibling_inodes;
	const struct proc_ns_operations *ns_ops;
	struct inode vfs_inode;
} __randomize_layout;

/*
 * General functions
 */

/* Recover the enclosing proc_inode from its embedded vfs_inode. */
static inline struct proc_inode *PROC_I(const struct inode *inode)
{
	return container_of(inode, struct proc_inode, vfs_inode);
}

/* The proc_dir_entry recorded in the inode's proc_inode. */
static inline struct proc_dir_entry *PDE(const struct inode *inode)
{
	return PROC_I(inode)->pde;
}

/* The struct pid recorded in the inode's proc_inode. */
static inline struct pid *proc_pid(const struct inode *inode)
{
	return PROC_I(inode)->pid;
}

/* get_pid_task() on the inode's pid, looked up as PIDTYPE_PID. */
static inline struct task_struct *get_proc_task(const struct inode *inode)
{
	return get_pid_task(proc_pid(inode), PIDTYPE_PID);
}

void task_dump_owner(struct task_struct *task, umode_t mode,
		     kuid_t *ruid, kgid_t *rgid);

unsigned name_to_int(const struct qstr *qstr);
/*
 * Offset of the first process in the /proc root directory..
 */
#define FIRST_PROCESS_ENTRY 256

/* Worst case buffer size needed for holding an integer.
*/ #define PROC_NUMBUF 13 /* * array.c */ extern const struct file_operations proc_tid_children_operations; extern void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape); extern int proc_tid_stat(struct seq_file *, struct pid_namespace *, struct pid *, struct task_struct *); extern int proc_tgid_stat(struct seq_file *, struct pid_namespace *, struct pid *, struct task_struct *); extern int proc_pid_status(struct seq_file *, struct pid_namespace *, struct pid *, struct task_struct *); extern int proc_pid_statm(struct seq_file *, struct pid_namespace *, struct pid *, struct task_struct *); /* * base.c */ extern const struct dentry_operations pid_dentry_operations; extern int pid_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern int proc_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); extern void proc_pid_evict_inode(struct proc_inode *); extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t); extern void pid_update_inode(struct task_struct *, struct inode *); extern int pid_delete_dentry(const struct dentry *); extern int proc_pid_readdir(struct file *, struct dir_context *); struct dentry *proc_pid_lookup(struct dentry *, unsigned int); extern loff_t mem_lseek(struct file *, loff_t, int); /* Lookups */ typedef struct dentry *instantiate_t(struct dentry *, struct task_struct *, const void *); bool proc_fill_cache(struct file *, struct dir_context *, const char *, unsigned int, instantiate_t, struct task_struct *, const void *); /* * generic.c */ struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode, struct proc_dir_entry **parent, void *data); struct proc_dir_entry *proc_register(struct proc_dir_entry *dir, struct proc_dir_entry *dp); extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int); struct dentry *proc_lookup_de(struct inode *, struct dentry *, struct proc_dir_entry *); extern int proc_readdir(struct file 
*, struct dir_context *); int proc_readdir_de(struct file *, struct dir_context *, struct proc_dir_entry *); static inline void pde_get(struct proc_dir_entry *pde) { refcount_inc(&pde->refcnt); } extern void pde_put(struct proc_dir_entry *); static inline bool is_empty_pde(const struct proc_dir_entry *pde) { return S_ISDIR(pde->mode) && !pde->proc_iops; } extern ssize_t proc_simple_write(struct file *, const char __user *, size_t, loff_t *); /* * inode.c */ struct pde_opener { struct list_head lh; struct file *file; bool closing; struct completion *c; } __randomize_layout; extern const struct inode_operations proc_link_inode_operations; extern const struct inode_operations proc_pid_link_inode_operations; extern const struct super_operations proc_sops; void proc_init_kmemcache(void); void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock); void set_proc_pid_nlink(void); extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); extern void proc_entry_rundown(struct proc_dir_entry *); /* * proc_namespaces.c */ extern const struct inode_operations proc_ns_dir_inode_operations; extern const struct file_operations proc_ns_dir_operations; /* * proc_net.c */ extern const struct file_operations proc_net_operations; extern const struct inode_operations proc_net_inode_operations; #ifdef CONFIG_NET extern int proc_net_init(void); #else static inline int proc_net_init(void) { return 0; } #endif /* * proc_self.c */ extern int proc_setup_self(struct super_block *); /* * proc_thread_self.c */ extern int proc_setup_thread_self(struct super_block *); extern void proc_thread_self_init(void); /* * proc_sysctl.c */ #ifdef CONFIG_PROC_SYSCTL extern int proc_sys_init(void); extern void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head); #else static inline void proc_sys_init(void) { } static inline void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head) { } #endif /* * proc_tty.c */ #ifdef 
CONFIG_TTY extern void proc_tty_init(void); #else static inline void proc_tty_init(void) {} #endif /* * root.c */ extern struct proc_dir_entry proc_root; extern void proc_self_init(void); /* * task_[no]mmu.c */ struct mem_size_stats; struct proc_maps_private { struct inode *inode; struct task_struct *task; struct mm_struct *mm; struct vma_iterator iter; #ifdef CONFIG_NUMA struct mempolicy *task_mempolicy; #endif } __randomize_layout; struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode); extern const struct file_operations proc_pid_maps_operations; extern const struct file_operations proc_pid_numa_maps_operations; extern const struct file_operations proc_pid_smaps_operations; extern const struct file_operations proc_pid_smaps_rollup_operations; extern const struct file_operations proc_clear_refs_operations; extern const struct file_operations proc_pagemap_operations; extern unsigned long task_vsize(struct mm_struct *); extern unsigned long task_statm(struct mm_struct *, unsigned long *, unsigned long *, unsigned long *, unsigned long *); extern void task_mem(struct seq_file *, struct mm_struct *); extern const struct dentry_operations proc_net_dentry_ops; static inline void pde_force_lookup(struct proc_dir_entry *pde) { /* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */ pde->proc_dops = &proc_net_dentry_ops; }
16 16 3 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 // SPDX-License-Identifier: GPL-2.0 /* * devtmpfs - kernel-maintained tmpfs-based /dev * * Copyright (C) 2009, Kay Sievers <kay.sievers@vrfy.org> * * During bootup, 
before any driver core device is registered, * devtmpfs, a tmpfs-based filesystem is created. Every driver-core * device which requests a device node, will add a node in this * filesystem. * By default, all devices are named after the name of the device, * owned by root and have a default mode of 0600. Subsystems can * overwrite the default setting if needed. */ #define pr_fmt(fmt) "devtmpfs: " fmt #include <linux/kernel.h> #include <linux/syscalls.h> #include <linux/mount.h> #include <linux/device.h> #include <linux/blkdev.h> #include <linux/namei.h> #include <linux/fs.h> #include <linux/shmem_fs.h> #include <linux/ramfs.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/kthread.h> #include <linux/init_syscalls.h> #include <uapi/linux/mount.h> #include "base.h" #ifdef CONFIG_DEVTMPFS_SAFE #define DEVTMPFS_MFLAGS (MS_SILENT | MS_NOEXEC | MS_NOSUID) #else #define DEVTMPFS_MFLAGS (MS_SILENT) #endif static struct task_struct *thread; static int __initdata mount_dev = IS_ENABLED(CONFIG_DEVTMPFS_MOUNT); static DEFINE_SPINLOCK(req_lock); static struct req { struct req *next; struct completion done; int err; const char *name; umode_t mode; /* 0 => delete */ kuid_t uid; kgid_t gid; struct device *dev; } *requests; static int __init mount_param(char *str) { mount_dev = simple_strtoul(str, NULL, 0); return 1; } __setup("devtmpfs.mount=", mount_param); static struct vfsmount *mnt; static struct dentry *public_dev_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { struct super_block *s = mnt->mnt_sb; int err; atomic_inc(&s->s_active); down_write(&s->s_umount); err = reconfigure_single(s, flags, data); if (err < 0) { deactivate_locked_super(s); return ERR_PTR(err); } return dget(s->s_root); } static struct file_system_type internal_fs_type = { .name = "devtmpfs", #ifdef CONFIG_TMPFS .init_fs_context = shmem_init_fs_context, #else .init_fs_context = ramfs_init_fs_context, #endif .kill_sb = kill_litter_super, }; static struct 
file_system_type dev_fs_type = { .name = "devtmpfs", .mount = public_dev_mount, }; static int devtmpfs_submit_req(struct req *req, const char *tmp) { init_completion(&req->done); spin_lock(&req_lock); req->next = requests; requests = req; spin_unlock(&req_lock); wake_up_process(thread); wait_for_completion(&req->done); kfree(tmp); return req->err; } int devtmpfs_create_node(struct device *dev) { const char *tmp = NULL; struct req req; if (!thread) return 0; req.mode = 0; req.uid = GLOBAL_ROOT_UID; req.gid = GLOBAL_ROOT_GID; req.name = device_get_devnode(dev, &req.mode, &req.uid, &req.gid, &tmp); if (!req.name) return -ENOMEM; if (req.mode == 0) req.mode = 0600; if (is_blockdev(dev)) req.mode |= S_IFBLK; else req.mode |= S_IFCHR; req.dev = dev; return devtmpfs_submit_req(&req, tmp); } int devtmpfs_delete_node(struct device *dev) { const char *tmp = NULL; struct req req; if (!thread) return 0; req.name = device_get_devnode(dev, NULL, NULL, NULL, &tmp); if (!req.name) return -ENOMEM; req.mode = 0; req.dev = dev; return devtmpfs_submit_req(&req, tmp); } static int dev_mkdir(const char *name, umode_t mode) { struct dentry *dentry; struct path path; int err; dentry = kern_path_create(AT_FDCWD, name, &path, LOOKUP_DIRECTORY); if (IS_ERR(dentry)) return PTR_ERR(dentry); err = vfs_mkdir(&nop_mnt_idmap, d_inode(path.dentry), dentry, mode); if (!err) /* mark as kernel-created inode */ d_inode(dentry)->i_private = &thread; done_path_create(&path, dentry); return err; } static int create_path(const char *nodepath) { char *path; char *s; int err = 0; /* parent directories do not exist, create them */ path = kstrdup(nodepath, GFP_KERNEL); if (!path) return -ENOMEM; s = path; for (;;) { s = strchr(s, '/'); if (!s) break; s[0] = '\0'; err = dev_mkdir(path, 0755); if (err && err != -EEXIST) break; s[0] = '/'; s++; } kfree(path); return err; } static int handle_create(const char *nodename, umode_t mode, kuid_t uid, kgid_t gid, struct device *dev) { struct dentry *dentry; struct path 
path; int err; dentry = kern_path_create(AT_FDCWD, nodename, &path, 0); if (dentry == ERR_PTR(-ENOENT)) { create_path(nodename); dentry = kern_path_create(AT_FDCWD, nodename, &path, 0); } if (IS_ERR(dentry)) return PTR_ERR(dentry); err = vfs_mknod(&nop_mnt_idmap, d_inode(path.dentry), dentry, mode, dev->devt); if (!err) { struct iattr newattrs; newattrs.ia_mode = mode; newattrs.ia_uid = uid; newattrs.ia_gid = gid; newattrs.ia_valid = ATTR_MODE|ATTR_UID|ATTR_GID; inode_lock(d_inode(dentry)); notify_change(&nop_mnt_idmap, dentry, &newattrs, NULL); inode_unlock(d_inode(dentry)); /* mark as kernel-created inode */ d_inode(dentry)->i_private = &thread; } done_path_create(&path, dentry); return err; } static int dev_rmdir(const char *name) { struct path parent; struct dentry *dentry; int err; dentry = kern_path_locked(name, &parent); if (IS_ERR(dentry)) return PTR_ERR(dentry); if (d_really_is_positive(dentry)) { if (d_inode(dentry)->i_private == &thread) err = vfs_rmdir(&nop_mnt_idmap, d_inode(parent.dentry), dentry); else err = -EPERM; } else { err = -ENOENT; } dput(dentry); inode_unlock(d_inode(parent.dentry)); path_put(&parent); return err; } static int delete_path(const char *nodepath) { char *path; int err = 0; path = kstrdup(nodepath, GFP_KERNEL); if (!path) return -ENOMEM; for (;;) { char *base; base = strrchr(path, '/'); if (!base) break; base[0] = '\0'; err = dev_rmdir(path); if (err) break; } kfree(path); return err; } static int dev_mynode(struct device *dev, struct inode *inode, struct kstat *stat) { /* did we create it */ if (inode->i_private != &thread) return 0; /* does the dev_t match */ if (is_blockdev(dev)) { if (!S_ISBLK(stat->mode)) return 0; } else { if (!S_ISCHR(stat->mode)) return 0; } if (stat->rdev != dev->devt) return 0; /* ours */ return 1; } static int handle_remove(const char *nodename, struct device *dev) { struct path parent; struct dentry *dentry; int deleted = 0; int err; dentry = kern_path_locked(nodename, &parent); if (IS_ERR(dentry)) 
return PTR_ERR(dentry); if (d_really_is_positive(dentry)) { struct kstat stat; struct path p = {.mnt = parent.mnt, .dentry = dentry}; err = vfs_getattr(&p, &stat, STATX_TYPE | STATX_MODE, AT_STATX_SYNC_AS_STAT); if (!err && dev_mynode(dev, d_inode(dentry), &stat)) { struct iattr newattrs; /* * before unlinking this node, reset permissions * of possible references like hardlinks */ newattrs.ia_uid = GLOBAL_ROOT_UID; newattrs.ia_gid = GLOBAL_ROOT_GID; newattrs.ia_mode = stat.mode & ~0777; newattrs.ia_valid = ATTR_UID|ATTR_GID|ATTR_MODE; inode_lock(d_inode(dentry)); notify_change(&nop_mnt_idmap, dentry, &newattrs, NULL); inode_unlock(d_inode(dentry)); err = vfs_unlink(&nop_mnt_idmap, d_inode(parent.dentry), dentry, NULL); if (!err || err == -ENOENT) deleted = 1; } } else { err = -ENOENT; } dput(dentry); inode_unlock(d_inode(parent.dentry)); path_put(&parent); if (deleted && strchr(nodename, '/')) delete_path(nodename); return err; } /* * If configured, or requested by the commandline, devtmpfs will be * auto-mounted after the kernel mounted the root filesystem. 
*/ int __init devtmpfs_mount(void) { int err; if (!mount_dev) return 0; if (!thread) return 0; err = init_mount("devtmpfs", "dev", "devtmpfs", DEVTMPFS_MFLAGS, NULL); if (err) pr_info("error mounting %d\n", err); else pr_info("mounted\n"); return err; } static __initdata DECLARE_COMPLETION(setup_done); static int handle(const char *name, umode_t mode, kuid_t uid, kgid_t gid, struct device *dev) { if (mode) return handle_create(name, mode, uid, gid, dev); else return handle_remove(name, dev); } static void __noreturn devtmpfs_work_loop(void) { while (1) { spin_lock(&req_lock); while (requests) { struct req *req = requests; requests = NULL; spin_unlock(&req_lock); while (req) { struct req *next = req->next; req->err = handle(req->name, req->mode, req->uid, req->gid, req->dev); complete(&req->done); req = next; } spin_lock(&req_lock); } __set_current_state(TASK_INTERRUPTIBLE); spin_unlock(&req_lock); schedule(); } } static noinline int __init devtmpfs_setup(void *p) { int err; err = ksys_unshare(CLONE_NEWNS); if (err) goto out; err = init_mount("devtmpfs", "/", "devtmpfs", DEVTMPFS_MFLAGS, NULL); if (err) goto out; init_chdir("/.."); /* will traverse into overmounted root */ init_chroot("."); out: *(int *)p = err; return err; } /* * The __ref is because devtmpfs_setup needs to be __init for the routines it * calls. That call is done while devtmpfs_init, which is marked __init, * synchronously waits for it to complete. */ static int __ref devtmpfsd(void *p) { int err = devtmpfs_setup(p); complete(&setup_done); if (err) return err; devtmpfs_work_loop(); return 0; } /* * Create devtmpfs instance, driver-core devices will add their device * nodes here. 
*/ int __init devtmpfs_init(void) { char opts[] = "mode=0755"; int err; mnt = vfs_kern_mount(&internal_fs_type, 0, "devtmpfs", opts); if (IS_ERR(mnt)) { pr_err("unable to create devtmpfs %ld\n", PTR_ERR(mnt)); return PTR_ERR(mnt); } err = register_filesystem(&dev_fs_type); if (err) { pr_err("unable to register devtmpfs type %d\n", err); return err; } thread = kthread_run(devtmpfsd, &err, "kdevtmpfs"); if (!IS_ERR(thread)) { wait_for_completion(&setup_done); } else { err = PTR_ERR(thread); thread = NULL; } if (err) { pr_err("unable to create devtmpfs %d\n", err); unregister_filesystem(&dev_fs_type); thread = NULL; return err; } pr_info("initialized\n"); return 0; }
11 11 8 42 39 42 1 5 1 9 1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef _DCCP_H #define _DCCP_H /* * net/dccp/dccp.h * * An implementation of the DCCP protocol * 
Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> * Copyright (c) 2005-6 Ian McDonald <ian.mcdonald@jandi.co.nz> */ #include <linux/dccp.h> #include <linux/ktime.h> #include <net/snmp.h> #include <net/sock.h> #include <net/tcp.h> #include "ackvec.h" /* * DCCP - specific warning and debugging macros. */ #define DCCP_WARN(fmt, ...) \ net_warn_ratelimited("%s: " fmt, __func__, ##__VA_ARGS__) #define DCCP_CRIT(fmt, a...) printk(KERN_CRIT fmt " at %s:%d/%s()\n", ##a, \ __FILE__, __LINE__, __func__) #define DCCP_BUG(a...) do { DCCP_CRIT("BUG: " a); dump_stack(); } while(0) #define DCCP_BUG_ON(cond) do { if (unlikely((cond) != 0)) \ DCCP_BUG("\"%s\" holds (exception!)", \ __stringify(cond)); \ } while (0) #define DCCP_PRINTK(enable, fmt, args...) do { if (enable) \ printk(fmt, ##args); \ } while(0) #define DCCP_PR_DEBUG(enable, fmt, a...) DCCP_PRINTK(enable, KERN_DEBUG \ "%s: " fmt, __func__, ##a) #ifdef CONFIG_IP_DCCP_DEBUG extern bool dccp_debug; #define dccp_pr_debug(format, a...) DCCP_PR_DEBUG(dccp_debug, format, ##a) #define dccp_pr_debug_cat(format, a...) DCCP_PRINTK(dccp_debug, format, ##a) #define dccp_debug(fmt, a...) dccp_pr_debug_cat(KERN_DEBUG fmt, ##a) #else #define dccp_pr_debug(format, a...) do {} while (0) #define dccp_pr_debug_cat(format, a...) do {} while (0) #define dccp_debug(format, a...) do {} while (0) #endif extern struct inet_hashinfo dccp_hashinfo; DECLARE_PER_CPU(unsigned int, dccp_orphan_count); void dccp_time_wait(struct sock *sk, int state, int timeo); /* * Set safe upper bounds for header and option length. Since Data Offset is 8 * bits (RFC 4340, sec. 5.1), the total header length can never be more than * 4 * 255 = 1020 bytes. 
The largest possible header length is 28 bytes (X=1): * - DCCP-Response with ACK Subheader and 4 bytes of Service code OR * - DCCP-Reset with ACK Subheader and 4 bytes of Reset Code fields * Hence a safe upper bound for the maximum option length is 1020-28 = 992 */ #define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(uint32_t)) #define DCCP_MAX_PACKET_HDR 28 #define DCCP_MAX_OPT_LEN (MAX_DCCP_SPECIFIC_HEADER - DCCP_MAX_PACKET_HDR) #define MAX_DCCP_HEADER (MAX_DCCP_SPECIFIC_HEADER + MAX_HEADER) /* Upper bound for initial feature-negotiation overhead (padded to 32 bits) */ #define DCCP_FEATNEG_OVERHEAD (32 * sizeof(uint32_t)) #define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT * state, about 60 seconds */ /* RFC 1122, 4.2.3.1 initial RTO value */ #define DCCP_TIMEOUT_INIT ((unsigned int)(3 * HZ)) /* * The maximum back-off value for retransmissions. This is needed for * - retransmitting client-Requests (sec. 8.1.1), * - retransmitting Close/CloseReq when closing (sec. 8.3), * - feature-negotiation retransmission (sec. 6.6.3), * - Acks in client-PARTOPEN state (sec. 8.1.5). */ #define DCCP_RTO_MAX ((unsigned int)(64 * HZ)) /* * RTT sampling: sanity bounds and fallback RTT value from RFC 4340, section 3.4 */ #define DCCP_SANE_RTT_MIN 100 #define DCCP_FALLBACK_RTT (USEC_PER_SEC / 5) #define DCCP_SANE_RTT_MAX (3 * USEC_PER_SEC) /* sysctl variables for DCCP */ extern int sysctl_dccp_request_retries; extern int sysctl_dccp_retries1; extern int sysctl_dccp_retries2; extern int sysctl_dccp_tx_qlen; extern int sysctl_dccp_sync_ratelimit; /* * 48-bit sequence number arithmetic (signed and unsigned) */ #define INT48_MIN 0x800000000000LL /* 2^47 */ #define UINT48_MAX 0xFFFFFFFFFFFFLL /* 2^48 - 1 */ #define COMPLEMENT48(x) (0x1000000000000LL - (x)) /* 2^48 - x */ #define TO_SIGNED48(x) (((x) < INT48_MIN)? (x) : -COMPLEMENT48( (x))) #define TO_UNSIGNED48(x) (((x) >= 0)? 
(x) : COMPLEMENT48(-(x))) #define ADD48(a, b) (((a) + (b)) & UINT48_MAX) #define SUB48(a, b) ADD48((a), COMPLEMENT48(b)) static inline void dccp_inc_seqno(u64 *seqno) { *seqno = ADD48(*seqno, 1); } /* signed mod-2^48 distance: pos. if seqno1 < seqno2, neg. if seqno1 > seqno2 */ static inline s64 dccp_delta_seqno(const u64 seqno1, const u64 seqno2) { u64 delta = SUB48(seqno2, seqno1); return TO_SIGNED48(delta); } /* is seq1 < seq2 ? */ static inline int before48(const u64 seq1, const u64 seq2) { return (s64)((seq2 << 16) - (seq1 << 16)) > 0; } /* is seq1 > seq2 ? */ #define after48(seq1, seq2) before48(seq2, seq1) /* is seq2 <= seq1 <= seq3 ? */ static inline int between48(const u64 seq1, const u64 seq2, const u64 seq3) { return (seq3 << 16) - (seq2 << 16) >= (seq1 << 16) - (seq2 << 16); } /** * dccp_loss_count - Approximate the number of lost data packets in a burst loss * @s1: last known sequence number before the loss ('hole') * @s2: first sequence number seen after the 'hole' * @ndp: NDP count on packet with sequence number @s2 */ static inline u64 dccp_loss_count(const u64 s1, const u64 s2, const u64 ndp) { s64 delta = dccp_delta_seqno(s1, s2); WARN_ON(delta < 0); delta -= ndp + 1; return delta > 0 ? 
delta : 0; } /** * dccp_loss_free - Evaluate condition for data loss from RFC 4340, 7.7.1 */ static inline bool dccp_loss_free(const u64 s1, const u64 s2, const u64 ndp) { return dccp_loss_count(s1, s2, ndp) == 0; } enum { DCCP_MIB_NUM = 0, DCCP_MIB_ACTIVEOPENS, /* ActiveOpens */ DCCP_MIB_ESTABRESETS, /* EstabResets */ DCCP_MIB_CURRESTAB, /* CurrEstab */ DCCP_MIB_OUTSEGS, /* OutSegs */ DCCP_MIB_OUTRSTS, DCCP_MIB_ABORTONTIMEOUT, DCCP_MIB_TIMEOUTS, DCCP_MIB_ABORTFAILED, DCCP_MIB_PASSIVEOPENS, DCCP_MIB_ATTEMPTFAILS, DCCP_MIB_OUTDATAGRAMS, DCCP_MIB_INERRS, DCCP_MIB_OPTMANDATORYERROR, DCCP_MIB_INVALIDOPT, __DCCP_MIB_MAX }; #define DCCP_MIB_MAX __DCCP_MIB_MAX struct dccp_mib { unsigned long mibs[DCCP_MIB_MAX]; }; DECLARE_SNMP_STAT(struct dccp_mib, dccp_statistics); #define DCCP_INC_STATS(field) SNMP_INC_STATS(dccp_statistics, field) #define __DCCP_INC_STATS(field) __SNMP_INC_STATS(dccp_statistics, field) #define DCCP_DEC_STATS(field) SNMP_DEC_STATS(dccp_statistics, field) /* * Checksumming routines */ static inline unsigned int dccp_csum_coverage(const struct sk_buff *skb) { const struct dccp_hdr* dh = dccp_hdr(skb); if (dh->dccph_cscov == 0) return skb->len; return (dh->dccph_doff + dh->dccph_cscov - 1) * sizeof(u32); } static inline void dccp_csum_outgoing(struct sk_buff *skb) { unsigned int cov = dccp_csum_coverage(skb); if (cov >= skb->len) dccp_hdr(skb)->dccph_cscov = 0; skb->csum = skb_checksum(skb, 0, (cov > skb->len)? 
skb->len : cov, 0); } void dccp_v4_send_check(struct sock *sk, struct sk_buff *skb); int dccp_retransmit_skb(struct sock *sk); void dccp_send_ack(struct sock *sk); void dccp_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *rsk); void dccp_send_sync(struct sock *sk, const u64 seq, const enum dccp_pkt_type pkt_type); /* * TX Packet Dequeueing Interface */ void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb); bool dccp_qpolicy_full(struct sock *sk); void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb); struct sk_buff *dccp_qpolicy_top(struct sock *sk); struct sk_buff *dccp_qpolicy_pop(struct sock *sk); bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param); /* * TX Packet Output and TX Timers */ void dccp_write_xmit(struct sock *sk); void dccp_write_space(struct sock *sk); void dccp_flush_write_queue(struct sock *sk, long *time_budget); void dccp_init_xmit_timers(struct sock *sk); static inline void dccp_clear_xmit_timers(struct sock *sk) { inet_csk_clear_xmit_timers(sk); } unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu); const char *dccp_packet_name(const int type); void dccp_set_state(struct sock *sk, const int state); void dccp_done(struct sock *sk); int dccp_reqsk_init(struct request_sock *rq, struct dccp_sock const *dp, struct sk_buff const *skb); int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb); struct sock *dccp_create_openreq_child(const struct sock *sk, const struct request_sock *req, const struct sk_buff *skb); int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb); struct sock *dccp_v4_request_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst, struct request_sock *req_unhash, bool *own_req); struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req); int dccp_child_process(struct sock *parent, struct sock *child, struct sk_buff *skb); int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, 
struct dccp_hdr *dh, unsigned int len); int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, const struct dccp_hdr *dh, const unsigned int len); void dccp_destruct_common(struct sock *sk); int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized); void dccp_destroy_sock(struct sock *sk); void dccp_close(struct sock *sk, long timeout); struct sk_buff *dccp_make_response(const struct sock *sk, struct dst_entry *dst, struct request_sock *req); int dccp_connect(struct sock *sk); int dccp_disconnect(struct sock *sk, int flags); int dccp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); int dccp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int dccp_ioctl(struct sock *sk, int cmd, int *karg); int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len); void dccp_shutdown(struct sock *sk, int how); int inet_dccp_listen(struct socket *sock, int backlog); __poll_t dccp_poll(struct file *file, struct socket *sock, poll_table *wait); int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); void dccp_req_err(struct sock *sk, u64 seq); struct sk_buff *dccp_ctl_make_reset(struct sock *sk, struct sk_buff *skb); int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code); void dccp_send_close(struct sock *sk, const int active); int dccp_invalid_packet(struct sk_buff *skb); u32 dccp_sample_rtt(struct sock *sk, long delta); static inline bool dccp_bad_service_code(const struct sock *sk, const __be32 service) { const struct dccp_sock *dp = dccp_sk(sk); if (dp->dccps_service == service) return false; return !dccp_list_has_service(dp->dccps_service_list, service); } /** * dccp_skb_cb - DCCP per-packet control information * @dccpd_type: one of %dccp_pkt_type (or unknown) * @dccpd_ccval: CCVal field (5.1), see e.g. 
RFC 4342, 8.1 * @dccpd_reset_code: one of %dccp_reset_codes * @dccpd_reset_data: Data1..3 fields (depend on @dccpd_reset_code) * @dccpd_opt_len: total length of all options (5.8) in the packet * @dccpd_seq: sequence number * @dccpd_ack_seq: acknowledgment number subheader field value * * This is used for transmission as well as for reception. */ struct dccp_skb_cb { union { struct inet_skb_parm h4; #if IS_ENABLED(CONFIG_IPV6) struct inet6_skb_parm h6; #endif } header; __u8 dccpd_type:4; __u8 dccpd_ccval:4; __u8 dccpd_reset_code, dccpd_reset_data[3]; __u16 dccpd_opt_len; __u64 dccpd_seq; __u64 dccpd_ack_seq; }; #define DCCP_SKB_CB(__skb) ((struct dccp_skb_cb *)&((__skb)->cb[0])) /* RFC 4340, sec. 7.7 */ static inline int dccp_non_data_packet(const struct sk_buff *skb) { const __u8 type = DCCP_SKB_CB(skb)->dccpd_type; return type == DCCP_PKT_ACK || type == DCCP_PKT_CLOSE || type == DCCP_PKT_CLOSEREQ || type == DCCP_PKT_RESET || type == DCCP_PKT_SYNC || type == DCCP_PKT_SYNCACK; } /* RFC 4340, sec. 
7.7 */ static inline int dccp_data_packet(const struct sk_buff *skb) { const __u8 type = DCCP_SKB_CB(skb)->dccpd_type; return type == DCCP_PKT_DATA || type == DCCP_PKT_DATAACK || type == DCCP_PKT_REQUEST || type == DCCP_PKT_RESPONSE; } static inline int dccp_packet_without_ack(const struct sk_buff *skb) { const __u8 type = DCCP_SKB_CB(skb)->dccpd_type; return type == DCCP_PKT_DATA || type == DCCP_PKT_REQUEST; } #define DCCP_PKT_WITHOUT_ACK_SEQ (UINT48_MAX << 2) static inline void dccp_hdr_set_seq(struct dccp_hdr *dh, const u64 gss) { struct dccp_hdr_ext *dhx = (struct dccp_hdr_ext *)((void *)dh + sizeof(*dh)); dh->dccph_seq2 = 0; dh->dccph_seq = htons((gss >> 32) & 0xfffff); dhx->dccph_seq_low = htonl(gss & 0xffffffff); } static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack, const u64 gsr) { dhack->dccph_reserved1 = 0; dhack->dccph_ack_nr_high = htons(gsr >> 32); dhack->dccph_ack_nr_low = htonl(gsr & 0xffffffff); } static inline void dccp_update_gsr(struct sock *sk, u64 seq) { struct dccp_sock *dp = dccp_sk(sk); if (after48(seq, dp->dccps_gsr)) dp->dccps_gsr = seq; /* Sequence validity window depends on remote Sequence Window (7.5.1) */ dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), dp->dccps_r_seq_win / 4); /* * Adjust SWL so that it is not below ISR. In contrast to RFC 4340, * 7.5.1 we perform this check beyond the initial handshake: W/W' are * always > 32, so for the first W/W' packets in the lifetime of a * connection we always have to adjust SWL. * A second reason why we are doing this is that the window depends on * the feature-remote value of Sequence Window: nothing stops the peer * from updating this value while we are busy adjusting SWL for the * first W packets (we would have to count from scratch again then). * Therefore it is safer to always make sure that the Sequence Window * is not artificially extended by a peer who grows SWL downwards by * continually updating the feature-remote Sequence-Window. 
* If sequence numbers wrap it is bad luck. But that will take a while * (48 bit), and this measure prevents Sequence-number attacks. */ if (before48(dp->dccps_swl, dp->dccps_isr)) dp->dccps_swl = dp->dccps_isr; dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4); } static inline void dccp_update_gss(struct sock *sk, u64 seq) { struct dccp_sock *dp = dccp_sk(sk); dp->dccps_gss = seq; /* Ack validity window depends on local Sequence Window value (7.5.1) */ dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win); /* Adjust AWL so that it is not below ISS - see comment above for SWL */ if (before48(dp->dccps_awl, dp->dccps_iss)) dp->dccps_awl = dp->dccps_iss; dp->dccps_awh = dp->dccps_gss; } static inline int dccp_ackvec_pending(const struct sock *sk) { return dccp_sk(sk)->dccps_hc_rx_ackvec != NULL && !dccp_ackvec_is_empty(dccp_sk(sk)->dccps_hc_rx_ackvec); } static inline int dccp_ack_pending(const struct sock *sk) { return dccp_ackvec_pending(sk) || inet_csk_ack_scheduled(sk); } int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val); int dccp_feat_finalise_settings(struct dccp_sock *dp); int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq); int dccp_feat_insert_opts(struct dccp_sock*, struct dccp_request_sock*, struct sk_buff *skb); int dccp_feat_activate_values(struct sock *sk, struct list_head *fn); void dccp_feat_list_purge(struct list_head *fn_list); int dccp_insert_options(struct sock *sk, struct sk_buff *skb); int dccp_insert_options_rsk(struct dccp_request_sock *, struct sk_buff *); u32 dccp_timestamp(void); void dccp_timestamping_init(void); int dccp_insert_option(struct sk_buff *skb, unsigned char option, const void *value, unsigned char len); #ifdef CONFIG_SYSCTL int dccp_sysctl_init(void); void dccp_sysctl_exit(void); #else static inline int dccp_sysctl_init(void) { return 0; } static inline void dccp_sysctl_exit(void) { } #endif #endif /* _DCCP_H */
26 305 315 283 298 29 13 13 6 13 2 8 3 15 9 35 260 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (c) 2023 Isovalent */ #ifndef __NET_TCX_H #define __NET_TCX_H #include <linux/bpf.h> #include <linux/bpf_mprog.h> #include <net/sch_generic.h> struct mini_Qdisc; struct tcx_entry { struct mini_Qdisc __rcu *miniq; struct bpf_mprog_bundle bundle; bool miniq_active; struct rcu_head rcu; }; struct tcx_link { struct bpf_link link; struct net_device *dev; u32 location; }; static inline void tcx_set_ingress(struct sk_buff *skb, bool ingress) { #ifdef CONFIG_NET_XGRESS skb->tc_at_ingress = ingress; #endif } #ifdef CONFIG_NET_XGRESS static inline struct tcx_entry *tcx_entry(struct bpf_mprog_entry *entry) { struct bpf_mprog_bundle *bundle = entry->parent; return container_of(bundle, struct tcx_entry, bundle); } static inline struct tcx_link *tcx_link(const struct bpf_link *link) { return container_of(link, struct tcx_link, link); } void tcx_inc(void); void tcx_dec(void); static inline void tcx_entry_sync(void) { /* bpf_mprog_entry got a/b swapped, therefore ensure that * there are no inflight users on the old one anymore. 
*/ synchronize_rcu(); } static inline void tcx_entry_update(struct net_device *dev, struct bpf_mprog_entry *entry, bool ingress) { ASSERT_RTNL(); if (ingress) rcu_assign_pointer(dev->tcx_ingress, entry); else rcu_assign_pointer(dev->tcx_egress, entry); } static inline struct bpf_mprog_entry * tcx_entry_fetch(struct net_device *dev, bool ingress) { ASSERT_RTNL(); if (ingress) return rcu_dereference_rtnl(dev->tcx_ingress); else return rcu_dereference_rtnl(dev->tcx_egress); } static inline struct bpf_mprog_entry *tcx_entry_create(void) { struct tcx_entry *tcx = kzalloc(sizeof(*tcx), GFP_KERNEL); if (tcx) { bpf_mprog_bundle_init(&tcx->bundle); return &tcx->bundle.a; } return NULL; } static inline void tcx_entry_free(struct bpf_mprog_entry *entry) { kfree_rcu(tcx_entry(entry), rcu); } static inline struct bpf_mprog_entry * tcx_entry_fetch_or_create(struct net_device *dev, bool ingress, bool *created) { struct bpf_mprog_entry *entry = tcx_entry_fetch(dev, ingress); *created = false; if (!entry) { entry = tcx_entry_create(); if (!entry) return NULL; *created = true; } return entry; } static inline void tcx_skeys_inc(bool ingress) { tcx_inc(); if (ingress) net_inc_ingress_queue(); else net_inc_egress_queue(); } static inline void tcx_skeys_dec(bool ingress) { if (ingress) net_dec_ingress_queue(); else net_dec_egress_queue(); tcx_dec(); } static inline void tcx_miniq_set_active(struct bpf_mprog_entry *entry, const bool active) { ASSERT_RTNL(); tcx_entry(entry)->miniq_active = active; } static inline bool tcx_entry_is_active(struct bpf_mprog_entry *entry) { ASSERT_RTNL(); return bpf_mprog_total(entry) || tcx_entry(entry)->miniq_active; } static inline enum tcx_action_base tcx_action_code(struct sk_buff *skb, int code) { switch (code) { case TCX_PASS: skb->tc_index = qdisc_skb_cb(skb)->tc_classid; fallthrough; case TCX_DROP: case TCX_REDIRECT: return code; case TCX_NEXT: default: return TCX_NEXT; } } #endif /* CONFIG_NET_XGRESS */ #if defined(CONFIG_NET_XGRESS) && 
defined(CONFIG_BPF_SYSCALL) int tcx_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog); int tcx_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int tcx_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog); void tcx_uninstall(struct net_device *dev, bool ingress); int tcx_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); static inline void dev_tcx_uninstall(struct net_device *dev) { ASSERT_RTNL(); tcx_uninstall(dev, true); tcx_uninstall(dev, false); } #else static inline int tcx_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) { return -EINVAL; } static inline int tcx_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { return -EINVAL; } static inline int tcx_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog) { return -EINVAL; } static inline int tcx_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { return -EINVAL; } static inline void dev_tcx_uninstall(struct net_device *dev) { } #endif /* CONFIG_NET_XGRESS && CONFIG_BPF_SYSCALL */ #endif /* __NET_TCX_H */
55 55 82 27 57 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 /* * Constant-time equality testing of memory regions. * * Authors: * * James Yonan <james@openvpn.net> * Daniel Borkmann <dborkman@redhat.com> * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * * Copyright(c) 2013 OpenVPN Technologies, Inc. All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. * The full GNU General Public License is included in this distribution * in the file called LICENSE.GPL. * * BSD LICENSE * * Copyright(c) 2013 OpenVPN Technologies, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * Neither the name of OpenVPN Technologies nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#include <asm/unaligned.h>
#include <crypto/algapi.h>
#include <linux/module.h>

/*
 * Generic path for arbitrary size.
 *
 * ORs the XOR of every compared unit into @neq, so the result is zero only
 * if all @size bytes of @a and @b are equal.  There is no early exit: the
 * loops always run over the full length.  When
 * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is defined, the bulk is compared in
 * unsigned-long-sized units first and the byte loop handles the remainder.
 *
 * NOTE(review): OPTIMIZER_HIDE_VAR() is presumably what stops the compiler
 * from turning the accumulation into a data-dependent branch; its definition
 * is not visible here.
 */
static inline unsigned long
__crypto_memneq_generic(const void *a, const void *b, size_t size)
{
	unsigned long neq = 0;

#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
	while (size >= sizeof(unsigned long)) {
		neq |= get_unaligned((unsigned long *)a) ^
		       get_unaligned((unsigned long *)b);
		OPTIMIZER_HIDE_VAR(neq);
		a += sizeof(unsigned long);
		b += sizeof(unsigned long);
		size -= sizeof(unsigned long);
	}
#endif /* CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS */
	/* Byte-wise tail (or the whole buffer without the fast path above). */
	while (size > 0) {
		neq |= *(unsigned char *)a ^ *(unsigned char *)b;
		OPTIMIZER_HIDE_VAR(neq);
		a += 1;
		b += 1;
		size -= 1;
	}
	return neq;
}

/*
 * Loop-free fast-path for frequently used 16-byte size.
 *
 * Same accumulate-the-XOR scheme as the generic path, fully unrolled:
 * two 8-byte loads, or four 4-byte loads, when efficient unaligned access
 * is configured and the type sizes match; otherwise sixteen single-byte
 * comparisons.  Returns zero only if all 16 bytes are equal.
 */
static inline unsigned long __crypto_memneq_16(const void *a, const void *b)
{
	unsigned long neq = 0;

#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
	if (sizeof(unsigned long) == 8) {
		neq |= get_unaligned((unsigned long *)a) ^
		       get_unaligned((unsigned long *)b);
		OPTIMIZER_HIDE_VAR(neq);
		neq |= get_unaligned((unsigned long *)(a + 8)) ^
		       get_unaligned((unsigned long *)(b + 8));
		OPTIMIZER_HIDE_VAR(neq);
	} else if (sizeof(unsigned int) == 4) {
		neq |= get_unaligned((unsigned int *)a) ^
		       get_unaligned((unsigned int *)b);
		OPTIMIZER_HIDE_VAR(neq);
		neq |= get_unaligned((unsigned int *)(a + 4)) ^
		       get_unaligned((unsigned int *)(b + 4));
		OPTIMIZER_HIDE_VAR(neq);
		neq |= get_unaligned((unsigned int *)(a + 8)) ^
		       get_unaligned((unsigned int *)(b + 8));
		OPTIMIZER_HIDE_VAR(neq);
		neq |= get_unaligned((unsigned int *)(a + 12)) ^
		       get_unaligned((unsigned int *)(b + 12));
		OPTIMIZER_HIDE_VAR(neq);
	} else
#endif /* CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS */
	{
		/* Portable fallback: one byte at a time, still without branches. */
		neq |= *(unsigned char *)(a)   ^ *(unsigned char *)(b);
		OPTIMIZER_HIDE_VAR(neq);
		neq |= *(unsigned char *)(a+1) ^ *(unsigned char *)(b+1);
		OPTIMIZER_HIDE_VAR(neq);
		neq |= *(unsigned char *)(a+2) ^ *(unsigned char *)(b+2);
		OPTIMIZER_HIDE_VAR(neq);
		neq |= *(unsigned char *)(a+3) ^ *(unsigned char *)(b+3);
		OPTIMIZER_HIDE_VAR(neq);
		neq |= *(unsigned char *)(a+4) ^ *(unsigned char *)(b+4);
		OPTIMIZER_HIDE_VAR(neq);
		neq |= *(unsigned char *)(a+5) ^ *(unsigned char *)(b+5);
		OPTIMIZER_HIDE_VAR(neq);
		neq |= *(unsigned char *)(a+6) ^ *(unsigned char *)(b+6);
		OPTIMIZER_HIDE_VAR(neq);
		neq |= *(unsigned char *)(a+7) ^ *(unsigned char *)(b+7);
		OPTIMIZER_HIDE_VAR(neq);
		neq |= *(unsigned char *)(a+8) ^ *(unsigned char *)(b+8);
		OPTIMIZER_HIDE_VAR(neq);
		neq |= *(unsigned char *)(a+9) ^ *(unsigned char *)(b+9);
		OPTIMIZER_HIDE_VAR(neq);
		neq |= *(unsigned char *)(a+10) ^ *(unsigned char *)(b+10);
		OPTIMIZER_HIDE_VAR(neq);
		neq |= *(unsigned char *)(a+11) ^ *(unsigned char *)(b+11);
		OPTIMIZER_HIDE_VAR(neq);
		neq |= *(unsigned char *)(a+12) ^ *(unsigned char *)(b+12);
		OPTIMIZER_HIDE_VAR(neq);
		neq |= *(unsigned char *)(a+13) ^ *(unsigned char *)(b+13);
		OPTIMIZER_HIDE_VAR(neq);
		neq |= *(unsigned char *)(a+14) ^ *(unsigned char *)(b+14);
		OPTIMIZER_HIDE_VAR(neq);
		neq |= *(unsigned char *)(a+15) ^ *(unsigned char *)(b+15);
		OPTIMIZER_HIDE_VAR(neq);
	}

	return neq;
}

/* Compare two areas of memory without leaking timing information,
 * and with special optimizations for common sizes.  Users should
 * not call this function directly, but should instead use
 * crypto_memneq defined in crypto/algapi.h.
 *
 * Returns zero when the two regions are equal and a non-zero value
 * otherwise.  Only size 16 has a dedicated path; every other size goes
 * through the generic loop.
 */
noinline unsigned long __crypto_memneq(const void *a, const void *b,
				       size_t size)
{
	switch (size) {
	case 16:
		return __crypto_memneq_16(a, b);
	default:
		return __crypto_memneq_generic(a, b, size);
	}
}
EXPORT_SYMBOL(__crypto_memneq);
5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2015, Heiner Kallweit <hkallweit1@gmail.com> */ #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include "leds.h" DEFINE_LED_TRIGGER(bt_power_led_trigger); struct hci_basic_led_trigger { struct led_trigger led_trigger; struct hci_dev *hdev; }; #define to_hci_basic_led_trigger(arg) container_of(arg, \ struct hci_basic_led_trigger, led_trigger) void hci_leds_update_powered(struct hci_dev *hdev, bool enabled) { if (hdev->power_led) led_trigger_event(hdev->power_led, enabled ? LED_FULL : LED_OFF); if (!enabled) { struct hci_dev *d; read_lock(&hci_dev_list_lock); list_for_each_entry(d, &hci_dev_list, list) { if (test_bit(HCI_UP, &d->flags)) enabled = true; } read_unlock(&hci_dev_list_lock); } led_trigger_event(bt_power_led_trigger, enabled ? LED_FULL : LED_OFF); } static int power_activate(struct led_classdev *led_cdev) { struct hci_basic_led_trigger *htrig; bool powered; htrig = to_hci_basic_led_trigger(led_cdev->trigger); powered = test_bit(HCI_UP, &htrig->hdev->flags); led_trigger_event(led_cdev->trigger, powered ? 
LED_FULL : LED_OFF); return 0; } static struct led_trigger *led_allocate_basic(struct hci_dev *hdev, int (*activate)(struct led_classdev *led_cdev), const char *name) { struct hci_basic_led_trigger *htrig; htrig = devm_kzalloc(&hdev->dev, sizeof(*htrig), GFP_KERNEL); if (!htrig) return NULL; htrig->hdev = hdev; htrig->led_trigger.activate = activate; htrig->led_trigger.name = devm_kasprintf(&hdev->dev, GFP_KERNEL, "%s-%s", hdev->name, name); if (!htrig->led_trigger.name) goto err_alloc; if (devm_led_trigger_register(&hdev->dev, &htrig->led_trigger)) goto err_register; return &htrig->led_trigger; err_register: devm_kfree(&hdev->dev, (void *)htrig->led_trigger.name); err_alloc: devm_kfree(&hdev->dev, htrig); return NULL; } void hci_leds_init(struct hci_dev *hdev) { /* initialize power_led */ hdev->power_led = led_allocate_basic(hdev, power_activate, "power"); } void bt_leds_init(void) { led_trigger_register_simple("bluetooth-power", &bt_power_led_trigger); } void bt_leds_cleanup(void) { led_trigger_unregister_simple(bt_power_led_trigger); }
5 5 2 4 9 9 5 5 5 5 5 5 5 5 5 5 5 4 4 1 1 4 3 2 2 1 9 9 8 8 7 7 8 6 5 5 5 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 
509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 // SPDX-License-Identifier: GPL-2.0-or-later /* * CCM: Counter with CBC-MAC * * (C) Copyright IBM Corp. 
2007 - Joy Latten <latten@us.ibm.com> */ #include <crypto/internal/aead.h> #include <crypto/internal/cipher.h> #include <crypto/internal/hash.h> #include <crypto/internal/skcipher.h> #include <crypto/scatterwalk.h> #include <linux/err.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/slab.h> struct ccm_instance_ctx { struct crypto_skcipher_spawn ctr; struct crypto_ahash_spawn mac; }; struct crypto_ccm_ctx { struct crypto_ahash *mac; struct crypto_skcipher *ctr; }; struct crypto_rfc4309_ctx { struct crypto_aead *child; u8 nonce[3]; }; struct crypto_rfc4309_req_ctx { struct scatterlist src[3]; struct scatterlist dst[3]; struct aead_request subreq; }; struct crypto_ccm_req_priv_ctx { u8 odata[16]; u8 idata[16]; u8 auth_tag[16]; u32 flags; struct scatterlist src[3]; struct scatterlist dst[3]; union { struct ahash_request ahreq; struct skcipher_request skreq; }; }; struct cbcmac_tfm_ctx { struct crypto_cipher *child; }; struct cbcmac_desc_ctx { unsigned int len; u8 dg[]; }; static inline struct crypto_ccm_req_priv_ctx *crypto_ccm_reqctx( struct aead_request *req) { unsigned long align = crypto_aead_alignmask(crypto_aead_reqtfm(req)); return (void *)PTR_ALIGN((u8 *)aead_request_ctx(req), align + 1); } static int set_msg_len(u8 *block, unsigned int msglen, int csize) { __be32 data; memset(block, 0, csize); block += csize; if (csize >= 4) csize = 4; else if (msglen > (1 << (8 * csize))) return -EOVERFLOW; data = cpu_to_be32(msglen); memcpy(block - csize, (u8 *)&data + 4 - csize, csize); return 0; } static int crypto_ccm_setkey(struct crypto_aead *aead, const u8 *key, unsigned int keylen) { struct crypto_ccm_ctx *ctx = crypto_aead_ctx(aead); struct crypto_skcipher *ctr = ctx->ctr; struct crypto_ahash *mac = ctx->mac; int err; crypto_skcipher_clear_flags(ctr, CRYPTO_TFM_REQ_MASK); crypto_skcipher_set_flags(ctr, crypto_aead_get_flags(aead) & CRYPTO_TFM_REQ_MASK); err = crypto_skcipher_setkey(ctr, key, keylen); if (err) return err; 
crypto_ahash_clear_flags(mac, CRYPTO_TFM_REQ_MASK); crypto_ahash_set_flags(mac, crypto_aead_get_flags(aead) & CRYPTO_TFM_REQ_MASK); return crypto_ahash_setkey(mac, key, keylen); } static int crypto_ccm_setauthsize(struct crypto_aead *tfm, unsigned int authsize) { switch (authsize) { case 4: case 6: case 8: case 10: case 12: case 14: case 16: break; default: return -EINVAL; } return 0; } static int format_input(u8 *info, struct aead_request *req, unsigned int cryptlen) { struct crypto_aead *aead = crypto_aead_reqtfm(req); unsigned int lp = req->iv[0]; unsigned int l = lp + 1; unsigned int m; m = crypto_aead_authsize(aead); memcpy(info, req->iv, 16); /* format control info per RFC 3610 and * NIST Special Publication 800-38C */ *info |= (8 * ((m - 2) / 2)); if (req->assoclen) *info |= 64; return set_msg_len(info + 16 - l, cryptlen, l); } static int format_adata(u8 *adata, unsigned int a) { int len = 0; /* add control info for associated data * RFC 3610 and NIST Special Publication 800-38C */ if (a < 65280) { *(__be16 *)adata = cpu_to_be16(a); len = 2; } else { *(__be16 *)adata = cpu_to_be16(0xfffe); *(__be32 *)&adata[2] = cpu_to_be32(a); len = 6; } return len; } static int crypto_ccm_auth(struct aead_request *req, struct scatterlist *plain, unsigned int cryptlen) { struct crypto_ccm_req_priv_ctx *pctx = crypto_ccm_reqctx(req); struct crypto_aead *aead = crypto_aead_reqtfm(req); struct crypto_ccm_ctx *ctx = crypto_aead_ctx(aead); struct ahash_request *ahreq = &pctx->ahreq; unsigned int assoclen = req->assoclen; struct scatterlist sg[3]; u8 *odata = pctx->odata; u8 *idata = pctx->idata; int ilen, err; /* format control data for input */ err = format_input(odata, req, cryptlen); if (err) goto out; sg_init_table(sg, 3); sg_set_buf(&sg[0], odata, 16); /* format associated data and compute into mac */ if (assoclen) { ilen = format_adata(idata, assoclen); sg_set_buf(&sg[1], idata, ilen); sg_chain(sg, 3, req->src); } else { ilen = 0; sg_chain(sg, 2, req->src); } 
ahash_request_set_tfm(ahreq, ctx->mac); ahash_request_set_callback(ahreq, pctx->flags, NULL, NULL); ahash_request_set_crypt(ahreq, sg, NULL, assoclen + ilen + 16); err = crypto_ahash_init(ahreq); if (err) goto out; err = crypto_ahash_update(ahreq); if (err) goto out; /* we need to pad the MAC input to a round multiple of the block size */ ilen = 16 - (assoclen + ilen) % 16; if (ilen < 16) { memset(idata, 0, ilen); sg_init_table(sg, 2); sg_set_buf(&sg[0], idata, ilen); if (plain) sg_chain(sg, 2, plain); plain = sg; cryptlen += ilen; } ahash_request_set_crypt(ahreq, plain, odata, cryptlen); err = crypto_ahash_finup(ahreq); out: return err; } static void crypto_ccm_encrypt_done(void *data, int err) { struct aead_request *req = data; struct crypto_aead *aead = crypto_aead_reqtfm(req); struct crypto_ccm_req_priv_ctx *pctx = crypto_ccm_reqctx(req); u8 *odata = pctx->odata; if (!err) scatterwalk_map_and_copy(odata, req->dst, req->assoclen + req->cryptlen, crypto_aead_authsize(aead), 1); aead_request_complete(req, err); } static inline int crypto_ccm_check_iv(const u8 *iv) { /* 2 <= L <= 8, so 1 <= L' <= 7. */ if (1 > iv[0] || iv[0] > 7) return -EINVAL; return 0; } static int crypto_ccm_init_crypt(struct aead_request *req, u8 *tag) { struct crypto_ccm_req_priv_ctx *pctx = crypto_ccm_reqctx(req); struct scatterlist *sg; u8 *iv = req->iv; int err; err = crypto_ccm_check_iv(iv); if (err) return err; pctx->flags = aead_request_flags(req); /* Note: rfc 3610 and NIST 800-38C require counter of * zero to encrypt auth tag. 
*/ memset(iv + 15 - iv[0], 0, iv[0] + 1); sg_init_table(pctx->src, 3); sg_set_buf(pctx->src, tag, 16); sg = scatterwalk_ffwd(pctx->src + 1, req->src, req->assoclen); if (sg != pctx->src + 1) sg_chain(pctx->src, 2, sg); if (req->src != req->dst) { sg_init_table(pctx->dst, 3); sg_set_buf(pctx->dst, tag, 16); sg = scatterwalk_ffwd(pctx->dst + 1, req->dst, req->assoclen); if (sg != pctx->dst + 1) sg_chain(pctx->dst, 2, sg); } return 0; } static int crypto_ccm_encrypt(struct aead_request *req) { struct crypto_aead *aead = crypto_aead_reqtfm(req); struct crypto_ccm_ctx *ctx = crypto_aead_ctx(aead); struct crypto_ccm_req_priv_ctx *pctx = crypto_ccm_reqctx(req); struct skcipher_request *skreq = &pctx->skreq; struct scatterlist *dst; unsigned int cryptlen = req->cryptlen; u8 *odata = pctx->odata; u8 *iv = req->iv; int err; err = crypto_ccm_init_crypt(req, odata); if (err) return err; err = crypto_ccm_auth(req, sg_next(pctx->src), cryptlen); if (err) return err; dst = pctx->src; if (req->src != req->dst) dst = pctx->dst; skcipher_request_set_tfm(skreq, ctx->ctr); skcipher_request_set_callback(skreq, pctx->flags, crypto_ccm_encrypt_done, req); skcipher_request_set_crypt(skreq, pctx->src, dst, cryptlen + 16, iv); err = crypto_skcipher_encrypt(skreq); if (err) return err; /* copy authtag to end of dst */ scatterwalk_map_and_copy(odata, sg_next(dst), cryptlen, crypto_aead_authsize(aead), 1); return err; } static void crypto_ccm_decrypt_done(void *data, int err) { struct aead_request *req = data; struct crypto_ccm_req_priv_ctx *pctx = crypto_ccm_reqctx(req); struct crypto_aead *aead = crypto_aead_reqtfm(req); unsigned int authsize = crypto_aead_authsize(aead); unsigned int cryptlen = req->cryptlen - authsize; struct scatterlist *dst; pctx->flags = 0; dst = sg_next(req->src == req->dst ? 
pctx->src : pctx->dst); if (!err) { err = crypto_ccm_auth(req, dst, cryptlen); if (!err && crypto_memneq(pctx->auth_tag, pctx->odata, authsize)) err = -EBADMSG; } aead_request_complete(req, err); } static int crypto_ccm_decrypt(struct aead_request *req) { struct crypto_aead *aead = crypto_aead_reqtfm(req); struct crypto_ccm_ctx *ctx = crypto_aead_ctx(aead); struct crypto_ccm_req_priv_ctx *pctx = crypto_ccm_reqctx(req); struct skcipher_request *skreq = &pctx->skreq; struct scatterlist *dst; unsigned int authsize = crypto_aead_authsize(aead); unsigned int cryptlen = req->cryptlen; u8 *authtag = pctx->auth_tag; u8 *odata = pctx->odata; u8 *iv = pctx->idata; int err; cryptlen -= authsize; err = crypto_ccm_init_crypt(req, authtag); if (err) return err; scatterwalk_map_and_copy(authtag, sg_next(pctx->src), cryptlen, authsize, 0); dst = pctx->src; if (req->src != req->dst) dst = pctx->dst; memcpy(iv, req->iv, 16); skcipher_request_set_tfm(skreq, ctx->ctr); skcipher_request_set_callback(skreq, pctx->flags, crypto_ccm_decrypt_done, req); skcipher_request_set_crypt(skreq, pctx->src, dst, cryptlen + 16, iv); err = crypto_skcipher_decrypt(skreq); if (err) return err; err = crypto_ccm_auth(req, sg_next(dst), cryptlen); if (err) return err; /* verify */ if (crypto_memneq(authtag, odata, authsize)) return -EBADMSG; return err; } static int crypto_ccm_init_tfm(struct crypto_aead *tfm) { struct aead_instance *inst = aead_alg_instance(tfm); struct ccm_instance_ctx *ictx = aead_instance_ctx(inst); struct crypto_ccm_ctx *ctx = crypto_aead_ctx(tfm); struct crypto_ahash *mac; struct crypto_skcipher *ctr; unsigned long align; int err; mac = crypto_spawn_ahash(&ictx->mac); if (IS_ERR(mac)) return PTR_ERR(mac); ctr = crypto_spawn_skcipher(&ictx->ctr); err = PTR_ERR(ctr); if (IS_ERR(ctr)) goto err_free_mac; ctx->mac = mac; ctx->ctr = ctr; align = crypto_aead_alignmask(tfm); align &= ~(crypto_tfm_ctx_alignment() - 1); crypto_aead_set_reqsize( tfm, align + sizeof(struct 
crypto_ccm_req_priv_ctx) + max(crypto_ahash_reqsize(mac), crypto_skcipher_reqsize(ctr))); return 0; err_free_mac: crypto_free_ahash(mac); return err; } static void crypto_ccm_exit_tfm(struct crypto_aead *tfm) { struct crypto_ccm_ctx *ctx = crypto_aead_ctx(tfm); crypto_free_ahash(ctx->mac); crypto_free_skcipher(ctx->ctr); } static void crypto_ccm_free(struct aead_instance *inst) { struct ccm_instance_ctx *ctx = aead_instance_ctx(inst); crypto_drop_ahash(&ctx->mac); crypto_drop_skcipher(&ctx->ctr); kfree(inst); } static int crypto_ccm_create_common(struct crypto_template *tmpl, struct rtattr **tb, const char *ctr_name, const char *mac_name) { struct skcipher_alg_common *ctr; u32 mask; struct aead_instance *inst; struct ccm_instance_ctx *ictx; struct hash_alg_common *mac; int err; err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_AEAD, &mask); if (err) return err; inst = kzalloc(sizeof(*inst) + sizeof(*ictx), GFP_KERNEL); if (!inst) return -ENOMEM; ictx = aead_instance_ctx(inst); err = crypto_grab_ahash(&ictx->mac, aead_crypto_instance(inst), mac_name, 0, mask | CRYPTO_ALG_ASYNC); if (err) goto err_free_inst; mac = crypto_spawn_ahash_alg(&ictx->mac); err = -EINVAL; if (strncmp(mac->base.cra_name, "cbcmac(", 7) != 0 || mac->digestsize != 16) goto err_free_inst; err = crypto_grab_skcipher(&ictx->ctr, aead_crypto_instance(inst), ctr_name, 0, mask); if (err) goto err_free_inst; ctr = crypto_spawn_skcipher_alg_common(&ictx->ctr); /* The skcipher algorithm must be CTR mode, using 16-byte blocks. */ err = -EINVAL; if (strncmp(ctr->base.cra_name, "ctr(", 4) != 0 || ctr->ivsize != 16 || ctr->base.cra_blocksize != 1) goto err_free_inst; /* ctr and cbcmac must use the same underlying block cipher. 
*/ if (strcmp(ctr->base.cra_name + 4, mac->base.cra_name + 7) != 0) goto err_free_inst; err = -ENAMETOOLONG; if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME, "ccm(%s", ctr->base.cra_name + 4) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; if (snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME, "ccm_base(%s,%s)", ctr->base.cra_driver_name, mac->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; inst->alg.base.cra_priority = (mac->base.cra_priority + ctr->base.cra_priority) / 2; inst->alg.base.cra_blocksize = 1; inst->alg.base.cra_alignmask = ctr->base.cra_alignmask; inst->alg.ivsize = 16; inst->alg.chunksize = ctr->chunksize; inst->alg.maxauthsize = 16; inst->alg.base.cra_ctxsize = sizeof(struct crypto_ccm_ctx); inst->alg.init = crypto_ccm_init_tfm; inst->alg.exit = crypto_ccm_exit_tfm; inst->alg.setkey = crypto_ccm_setkey; inst->alg.setauthsize = crypto_ccm_setauthsize; inst->alg.encrypt = crypto_ccm_encrypt; inst->alg.decrypt = crypto_ccm_decrypt; inst->free = crypto_ccm_free; err = aead_register_instance(tmpl, inst); if (err) { err_free_inst: crypto_ccm_free(inst); } return err; } static int crypto_ccm_create(struct crypto_template *tmpl, struct rtattr **tb) { const char *cipher_name; char ctr_name[CRYPTO_MAX_ALG_NAME]; char mac_name[CRYPTO_MAX_ALG_NAME]; cipher_name = crypto_attr_alg_name(tb[1]); if (IS_ERR(cipher_name)) return PTR_ERR(cipher_name); if (snprintf(ctr_name, CRYPTO_MAX_ALG_NAME, "ctr(%s)", cipher_name) >= CRYPTO_MAX_ALG_NAME) return -ENAMETOOLONG; if (snprintf(mac_name, CRYPTO_MAX_ALG_NAME, "cbcmac(%s)", cipher_name) >= CRYPTO_MAX_ALG_NAME) return -ENAMETOOLONG; return crypto_ccm_create_common(tmpl, tb, ctr_name, mac_name); } static int crypto_ccm_base_create(struct crypto_template *tmpl, struct rtattr **tb) { const char *ctr_name; const char *mac_name; ctr_name = crypto_attr_alg_name(tb[1]); if (IS_ERR(ctr_name)) return PTR_ERR(ctr_name); mac_name = crypto_attr_alg_name(tb[2]); if (IS_ERR(mac_name)) return 
PTR_ERR(mac_name); return crypto_ccm_create_common(tmpl, tb, ctr_name, mac_name); } static int crypto_rfc4309_setkey(struct crypto_aead *parent, const u8 *key, unsigned int keylen) { struct crypto_rfc4309_ctx *ctx = crypto_aead_ctx(parent); struct crypto_aead *child = ctx->child; if (keylen < 3) return -EINVAL; keylen -= 3; memcpy(ctx->nonce, key + keylen, 3); crypto_aead_clear_flags(child, CRYPTO_TFM_REQ_MASK); crypto_aead_set_flags(child, crypto_aead_get_flags(parent) & CRYPTO_TFM_REQ_MASK); return crypto_aead_setkey(child, key, keylen); } static int crypto_rfc4309_setauthsize(struct crypto_aead *parent, unsigned int authsize) { struct crypto_rfc4309_ctx *ctx = crypto_aead_ctx(parent); switch (authsize) { case 8: case 12: case 16: break; default: return -EINVAL; } return crypto_aead_setauthsize(ctx->child, authsize); } static struct aead_request *crypto_rfc4309_crypt(struct aead_request *req) { struct crypto_rfc4309_req_ctx *rctx = aead_request_ctx(req); struct aead_request *subreq = &rctx->subreq; struct crypto_aead *aead = crypto_aead_reqtfm(req); struct crypto_rfc4309_ctx *ctx = crypto_aead_ctx(aead); struct crypto_aead *child = ctx->child; struct scatterlist *sg; u8 *iv = PTR_ALIGN((u8 *)(subreq + 1) + crypto_aead_reqsize(child), crypto_aead_alignmask(child) + 1); /* L' */ iv[0] = 3; memcpy(iv + 1, ctx->nonce, 3); memcpy(iv + 4, req->iv, 8); scatterwalk_map_and_copy(iv + 16, req->src, 0, req->assoclen - 8, 0); sg_init_table(rctx->src, 3); sg_set_buf(rctx->src, iv + 16, req->assoclen - 8); sg = scatterwalk_ffwd(rctx->src + 1, req->src, req->assoclen); if (sg != rctx->src + 1) sg_chain(rctx->src, 2, sg); if (req->src != req->dst) { sg_init_table(rctx->dst, 3); sg_set_buf(rctx->dst, iv + 16, req->assoclen - 8); sg = scatterwalk_ffwd(rctx->dst + 1, req->dst, req->assoclen); if (sg != rctx->dst + 1) sg_chain(rctx->dst, 2, sg); } aead_request_set_tfm(subreq, child); aead_request_set_callback(subreq, req->base.flags, req->base.complete, req->base.data); 
aead_request_set_crypt(subreq, rctx->src, req->src == req->dst ? rctx->src : rctx->dst, req->cryptlen, iv); aead_request_set_ad(subreq, req->assoclen - 8); return subreq; } static int crypto_rfc4309_encrypt(struct aead_request *req) { if (req->assoclen != 16 && req->assoclen != 20) return -EINVAL; req = crypto_rfc4309_crypt(req); return crypto_aead_encrypt(req); } static int crypto_rfc4309_decrypt(struct aead_request *req) { if (req->assoclen != 16 && req->assoclen != 20) return -EINVAL; req = crypto_rfc4309_crypt(req); return crypto_aead_decrypt(req); } static int crypto_rfc4309_init_tfm(struct crypto_aead *tfm) { struct aead_instance *inst = aead_alg_instance(tfm); struct crypto_aead_spawn *spawn = aead_instance_ctx(inst); struct crypto_rfc4309_ctx *ctx = crypto_aead_ctx(tfm); struct crypto_aead *aead; unsigned long align; aead = crypto_spawn_aead(spawn); if (IS_ERR(aead)) return PTR_ERR(aead); ctx->child = aead; align = crypto_aead_alignmask(aead); align &= ~(crypto_tfm_ctx_alignment() - 1); crypto_aead_set_reqsize( tfm, sizeof(struct crypto_rfc4309_req_ctx) + ALIGN(crypto_aead_reqsize(aead), crypto_tfm_ctx_alignment()) + align + 32); return 0; } static void crypto_rfc4309_exit_tfm(struct crypto_aead *tfm) { struct crypto_rfc4309_ctx *ctx = crypto_aead_ctx(tfm); crypto_free_aead(ctx->child); } static void crypto_rfc4309_free(struct aead_instance *inst) { crypto_drop_aead(aead_instance_ctx(inst)); kfree(inst); } static int crypto_rfc4309_create(struct crypto_template *tmpl, struct rtattr **tb) { u32 mask; struct aead_instance *inst; struct crypto_aead_spawn *spawn; struct aead_alg *alg; int err; err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_AEAD, &mask); if (err) return err; inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL); if (!inst) return -ENOMEM; spawn = aead_instance_ctx(inst); err = crypto_grab_aead(spawn, aead_crypto_instance(inst), crypto_attr_alg_name(tb[1]), 0, mask); if (err) goto err_free_inst; alg = crypto_spawn_aead_alg(spawn); err = 
-EINVAL; /* We only support 16-byte blocks. */ if (crypto_aead_alg_ivsize(alg) != 16) goto err_free_inst; /* Not a stream cipher? */ if (alg->base.cra_blocksize != 1) goto err_free_inst; err = -ENAMETOOLONG; if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME, "rfc4309(%s)", alg->base.cra_name) >= CRYPTO_MAX_ALG_NAME || snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME, "rfc4309(%s)", alg->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; inst->alg.base.cra_priority = alg->base.cra_priority; inst->alg.base.cra_blocksize = 1; inst->alg.base.cra_alignmask = alg->base.cra_alignmask; inst->alg.ivsize = 8; inst->alg.chunksize = crypto_aead_alg_chunksize(alg); inst->alg.maxauthsize = 16; inst->alg.base.cra_ctxsize = sizeof(struct crypto_rfc4309_ctx); inst->alg.init = crypto_rfc4309_init_tfm; inst->alg.exit = crypto_rfc4309_exit_tfm; inst->alg.setkey = crypto_rfc4309_setkey; inst->alg.setauthsize = crypto_rfc4309_setauthsize; inst->alg.encrypt = crypto_rfc4309_encrypt; inst->alg.decrypt = crypto_rfc4309_decrypt; inst->free = crypto_rfc4309_free; err = aead_register_instance(tmpl, inst); if (err) { err_free_inst: crypto_rfc4309_free(inst); } return err; } static int crypto_cbcmac_digest_setkey(struct crypto_shash *parent, const u8 *inkey, unsigned int keylen) { struct cbcmac_tfm_ctx *ctx = crypto_shash_ctx(parent); return crypto_cipher_setkey(ctx->child, inkey, keylen); } static int crypto_cbcmac_digest_init(struct shash_desc *pdesc) { struct cbcmac_desc_ctx *ctx = shash_desc_ctx(pdesc); int bs = crypto_shash_digestsize(pdesc->tfm); ctx->len = 0; memset(ctx->dg, 0, bs); return 0; } static int crypto_cbcmac_digest_update(struct shash_desc *pdesc, const u8 *p, unsigned int len) { struct crypto_shash *parent = pdesc->tfm; struct cbcmac_tfm_ctx *tctx = crypto_shash_ctx(parent); struct cbcmac_desc_ctx *ctx = shash_desc_ctx(pdesc); struct crypto_cipher *tfm = tctx->child; int bs = crypto_shash_digestsize(parent); while (len > 0) { unsigned int l 
= min(len, bs - ctx->len); crypto_xor(&ctx->dg[ctx->len], p, l); ctx->len +=l; len -= l; p += l; if (ctx->len == bs) { crypto_cipher_encrypt_one(tfm, ctx->dg, ctx->dg); ctx->len = 0; } } return 0; } static int crypto_cbcmac_digest_final(struct shash_desc *pdesc, u8 *out) { struct crypto_shash *parent = pdesc->tfm; struct cbcmac_tfm_ctx *tctx = crypto_shash_ctx(parent); struct cbcmac_desc_ctx *ctx = shash_desc_ctx(pdesc); struct crypto_cipher *tfm = tctx->child; int bs = crypto_shash_digestsize(parent); if (ctx->len) crypto_cipher_encrypt_one(tfm, ctx->dg, ctx->dg); memcpy(out, ctx->dg, bs); return 0; } static int cbcmac_init_tfm(struct crypto_tfm *tfm) { struct crypto_cipher *cipher; struct crypto_instance *inst = (void *)tfm->__crt_alg; struct crypto_cipher_spawn *spawn = crypto_instance_ctx(inst); struct cbcmac_tfm_ctx *ctx = crypto_tfm_ctx(tfm); cipher = crypto_spawn_cipher(spawn); if (IS_ERR(cipher)) return PTR_ERR(cipher); ctx->child = cipher; return 0; }; static void cbcmac_exit_tfm(struct crypto_tfm *tfm) { struct cbcmac_tfm_ctx *ctx = crypto_tfm_ctx(tfm); crypto_free_cipher(ctx->child); } static int cbcmac_create(struct crypto_template *tmpl, struct rtattr **tb) { struct shash_instance *inst; struct crypto_cipher_spawn *spawn; struct crypto_alg *alg; u32 mask; int err; err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_SHASH, &mask); if (err) return err; inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL); if (!inst) return -ENOMEM; spawn = shash_instance_ctx(inst); err = crypto_grab_cipher(spawn, shash_crypto_instance(inst), crypto_attr_alg_name(tb[1]), 0, mask); if (err) goto err_free_inst; alg = crypto_spawn_cipher_alg(spawn); err = crypto_inst_setname(shash_crypto_instance(inst), tmpl->name, alg); if (err) goto err_free_inst; inst->alg.base.cra_priority = alg->cra_priority; inst->alg.base.cra_blocksize = 1; inst->alg.digestsize = alg->cra_blocksize; inst->alg.descsize = sizeof(struct cbcmac_desc_ctx) + alg->cra_blocksize; inst->alg.base.cra_ctxsize 
= sizeof(struct cbcmac_tfm_ctx); inst->alg.base.cra_init = cbcmac_init_tfm; inst->alg.base.cra_exit = cbcmac_exit_tfm; inst->alg.init = crypto_cbcmac_digest_init; inst->alg.update = crypto_cbcmac_digest_update; inst->alg.final = crypto_cbcmac_digest_final; inst->alg.setkey = crypto_cbcmac_digest_setkey; inst->free = shash_free_singlespawn_instance; err = shash_register_instance(tmpl, inst); if (err) { err_free_inst: shash_free_singlespawn_instance(inst); } return err; } static struct crypto_template crypto_ccm_tmpls[] = { { .name = "cbcmac", .create = cbcmac_create, .module = THIS_MODULE, }, { .name = "ccm_base", .create = crypto_ccm_base_create, .module = THIS_MODULE, }, { .name = "ccm", .create = crypto_ccm_create, .module = THIS_MODULE, }, { .name = "rfc4309", .create = crypto_rfc4309_create, .module = THIS_MODULE, }, }; static int __init crypto_ccm_module_init(void) { return crypto_register_templates(crypto_ccm_tmpls, ARRAY_SIZE(crypto_ccm_tmpls)); } static void __exit crypto_ccm_module_exit(void) { crypto_unregister_templates(crypto_ccm_tmpls, ARRAY_SIZE(crypto_ccm_tmpls)); } subsys_initcall(crypto_ccm_module_init); module_exit(crypto_ccm_module_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Counter with CBC MAC"); MODULE_ALIAS_CRYPTO("ccm_base"); MODULE_ALIAS_CRYPTO("rfc4309"); MODULE_ALIAS_CRYPTO("ccm"); MODULE_ALIAS_CRYPTO("cbcmac"); MODULE_IMPORT_NS(CRYPTO_INTERNAL);
4 2400 2408 6 229 7 6 6 6 6 7 7 7 7 7 6 7 7 6 6 7 6 6 813 822 7 7 1 1 10 9 10 10 5 5 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 22488 22488 811 824 6 810 814 806 808 229 824 810 805 809 805 229 229 809 807 1671 1670 1704 1671 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 
473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 
973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 
1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 
1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 // SPDX-License-Identifier: GPL-2.0 /* * Common Block IO 
controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *		      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 * 	              Nauman Rafique <nauman@google.com>
 *
 * For policy-specific per-blkcg data:
 * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
 *                    Arianna Avanzini <avanzini.arianna@gmail.com>
 */
#include <linux/ioprio.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/sched/signal.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/slab.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include <linux/ctype.h>
#include <linux/resume_user_mode.h>
#include <linux/psi.h>
#include <linux/part_stat.h>
#include "blk.h"
#include "blk-cgroup.h"
#include "blk-ioprio.h"
#include "blk-throttle.h"

/* Forward declaration; called from __blkg_release() below. */
static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu);

/*
 * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
 * blkcg_pol_register_mutex nests outside of it and synchronizes entire
 * policy [un]register operations including cgroup file additions /
 * removals.  Putting cgroup file registration outside blkcg_pol_mutex
 * allows grabbing it from cgroup callbacks.
 */
static DEFINE_MUTEX(blkcg_pol_register_mutex);
static DEFINE_MUTEX(blkcg_pol_mutex);

/* The root blkcg; blkg creation walks down from it (see blkg_lookup_create()). */
struct blkcg blkcg_root;
EXPORT_SYMBOL_GPL(blkcg_root);

/* Constant pointer to the css embedded in blkcg_root. */
struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
EXPORT_SYMBOL_GPL(blkcg_root_css);

/*
 * Policy table.  The slot index is also the index into blkg->pd[] and is
 * what blkg_alloc() stores in pd->plid.
 */
static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];

static LIST_HEAD(all_blkcgs);		/* protected by blkcg_pol_mutex */

bool blkcg_debug_stats = false;

/* Serializes blkg stat updates (see the note in __blkg_release()). */
static DEFINE_RAW_SPINLOCK(blkg_stat_lock);

/*
 * Number of blkgs blkg_destroy_all() destroys before it drops queue_lock,
 * calls cond_resched() and restarts its walk.
 */
#define BLKG_DESTROY_BATCH_SIZE  64

/*
 * Lockless lists for tracking IO stats update
 *
 * New IO stats are stored in the percpu iostat_cpu within blkcg_gq (blkg).
* There are multiple blkg's (one for each block device) attached to each * blkcg. The rstat code keeps track of which cpu has IO stats updated, * but it doesn't know which blkg has the updated stats. If there are many * block devices in a system, the cost of iterating all the blkg's to flush * out the IO stats can be high. To reduce such overhead, a set of percpu * lockless lists (lhead) per blkcg are used to track the set of recently * updated iostat_cpu's since the last flush. An iostat_cpu will be put * onto the lockless list on the update side [blk_cgroup_bio_start()] if * not there yet and then removed when being flushed [blkcg_rstat_flush()]. * References to blkg are gotten and then put back in the process to * protect against blkg removal. * * Return: 0 if successful or -ENOMEM if allocation fails. */ static int init_blkcg_llists(struct blkcg *blkcg) { int cpu; blkcg->lhead = alloc_percpu_gfp(struct llist_head, GFP_KERNEL); if (!blkcg->lhead) return -ENOMEM; for_each_possible_cpu(cpu) init_llist_head(per_cpu_ptr(blkcg->lhead, cpu)); return 0; } /** * blkcg_css - find the current css * * Find the css associated with either the kthread or the current task. * This may return a dying css, so it is up to the caller to use tryget logic * to confirm it is alive and well. 
*/
static struct cgroup_subsys_state *blkcg_css(void)
{
	struct cgroup_subsys_state *css;

	/* a non-NULL result from kthread_blkcg() takes precedence */
	css = kthread_blkcg();
	if (css)
		return css;
	/* otherwise use current's css for io_cgrp_id */
	return task_css(current, io_cgrp_id);
}

/*
 * Return true when @pol is non-NULL and the bit for its plid is set in
 * @q->blkcg_pols.
 */
static bool blkcg_policy_enabled(struct request_queue *q,
				 const struct blkcg_policy *pol)
{
	return pol && test_bit(pol->plid, q->blkcg_pols);
}

/*
 * Work function scheduled by blkg_free().  Frees the per-policy data, drops
 * the reference on the parent blkg, unlinks the blkg from the queue's list,
 * puts the queue and finally frees the blkg itself.
 */
static void blkg_free_workfn(struct work_struct *work)
{
	struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
					     free_work);
	struct request_queue *q = blkg->q;
	int i;

	/*
	 * pd_free_fn() can also be called from blkcg_deactivate_policy(),
	 * in order to make sure pd_free_fn() is called in order, the deletion
	 * of the list blkg->q_node is delayed to here from blkg_destroy(), and
	 * blkcg_mutex is used to synchronize blkg_free_workfn() and
	 * blkcg_deactivate_policy().
	 */
	mutex_lock(&q->blkcg_mutex);
	for (i = 0; i < BLKCG_MAX_POLS; i++)
		if (blkg->pd[i])
			blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
	/* drop the reference on the parent that blkg_create() took */
	if (blkg->parent)
		blkg_put(blkg->parent);
	spin_lock_irq(&q->queue_lock);
	list_del_init(&blkg->q_node);
	spin_unlock_irq(&q->queue_lock);
	mutex_unlock(&q->blkcg_mutex);

	/* pairs with blk_get_queue() in blkg_alloc() */
	blk_put_queue(q);
	free_percpu(blkg->iostat_cpu);
	percpu_ref_exit(&blkg->refcnt);
	kfree(blkg);
}

/**
 * blkg_free - free a blkg
 * @blkg: blkg to free
 *
 * Free @blkg which may be partially allocated.  A %NULL @blkg is ignored.
 * The actual freeing is done asynchronously by blkg_free_workfn().
 */
static void blkg_free(struct blkcg_gq *blkg)
{
	if (!blkg)
		return;

	/*
	 * Both ->pd_free_fn() and request queue's release handler may
	 * sleep, so free us by scheduling one work func
	 */
	INIT_WORK(&blkg->free_work, blkg_free_workfn);
	schedule_work(&blkg->free_work);
}

/*
 * RCU callback queued by blkg_release().  Flushes the blkcg's per-cpu stat
 * lists, drops the blkcg css reference and hands the blkg to blkg_free().
 */
static void __blkg_release(struct rcu_head *rcu)
{
	struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
	struct blkcg *blkcg = blkg->blkcg;
	int cpu;

#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
	WARN_ON(!bio_list_empty(&blkg->async_bios));
#endif
	/*
	 * Flush all the non-empty percpu lockless lists before releasing
	 * us, given these stats belong to us.
	 *
	 * blkg_stat_lock is for serializing blkg stat update
	 */
	for_each_possible_cpu(cpu)
		__blkcg_rstat_flush(blkcg, cpu);

	/*
	 * Release the blkcg css ref this blkg has been holding.  The ref on
	 * the parent blkg is dropped later, in blkg_free_workfn().
	 */
	css_put(&blkg->blkcg->css);
	blkg_free(blkg);
}

/*
 * A group is RCU protected, but having an rcu lock does not mean that one
 * can access all the fields of blkg and assume these are valid.  For
 * example, don't try to follow throtl_data and request queue links.
 *
 * Having a reference to blkg under an rcu allows accesses to only values
 * local to groups like group stats and group rate limits.
 *
 * This is the release callback blkg_alloc() passes to percpu_ref_init();
 * it defers the real work to __blkg_release() through call_rcu().
 */
static void blkg_release(struct percpu_ref *ref)
{
	struct blkcg_gq *blkg = container_of(ref, struct blkcg_gq, refcnt);

	call_rcu(&blkg->rcu_head, __blkg_release);
}

#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
/* workqueue used by blkcg_punt_bio_submit(); set up in blkcg_punt_bio_init() */
static struct workqueue_struct *blkcg_punt_bio_wq;

/*
 * Work function for blkg->async_bio_work: takes every bio queued on
 * blkg->async_bios and submits them one by one.
 */
static void blkg_async_bio_workfn(struct work_struct *work)
{
	struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
					     async_bio_work);
	struct bio_list bios = BIO_EMPTY_LIST;
	struct bio *bio;
	struct blk_plug plug;
	bool need_plug = false;

	/* as long as there are pending bios, @blkg can't go away */
	spin_lock(&blkg->async_bio_lock);
	/* move the whole pending list to the local one, leaving it empty */
	bio_list_merge(&bios, &blkg->async_bios);
	bio_list_init(&blkg->async_bios);
	spin_unlock(&blkg->async_bio_lock);

	/* start plug only when bio_list contains at least 2 bios */
	if (bios.head && bios.head->bi_next) {
		need_plug = true;
		blk_start_plug(&plug);
	}
	while ((bio = bio_list_pop(&bios)))
		submit_bio(bio);
	if (need_plug)
		blk_finish_plug(&plug);
}

/*
 * When a shared kthread issues a bio for a cgroup, doing so synchronously can
 * lead to priority inversions as the kthread can be trapped waiting for that
 * cgroup.  Use this helper instead of submit_bio to punt the actual issuing to
 * a dedicated per-blkcg work item to avoid such priority inversions.
*/ void blkcg_punt_bio_submit(struct bio *bio) { struct blkcg_gq *blkg = bio->bi_blkg; if (blkg->parent) { spin_lock(&blkg->async_bio_lock); bio_list_add(&blkg->async_bios, bio); spin_unlock(&blkg->async_bio_lock); queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work); } else { /* never bounce for the root cgroup */ submit_bio(bio); } } EXPORT_SYMBOL_GPL(blkcg_punt_bio_submit); static int __init blkcg_punt_bio_init(void) { blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio", WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND | WQ_SYSFS, 0); if (!blkcg_punt_bio_wq) return -ENOMEM; return 0; } subsys_initcall(blkcg_punt_bio_init); #endif /* CONFIG_BLK_CGROUP_PUNT_BIO */ /** * bio_blkcg_css - return the blkcg CSS associated with a bio * @bio: target bio * * This returns the CSS for the blkcg associated with a bio, or %NULL if not * associated. Callers are expected to either handle %NULL or know association * has been done prior to calling this. */ struct cgroup_subsys_state *bio_blkcg_css(struct bio *bio) { if (!bio || !bio->bi_blkg) return NULL; return &bio->bi_blkg->blkcg->css; } EXPORT_SYMBOL_GPL(bio_blkcg_css); /** * blkcg_parent - get the parent of a blkcg * @blkcg: blkcg of interest * * Return the parent blkcg of @blkcg. Can be called anytime. */ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) { return css_to_blkcg(blkcg->css.parent); } /** * blkg_alloc - allocate a blkg * @blkcg: block cgroup the new blkg is associated with * @disk: gendisk the new blkg is associated with * @gfp_mask: allocation mask to use * * Allocate a new blkg associating @blkcg and @disk. 
*/
static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
				   gfp_t gfp_mask)
{
	struct request_queue *q = disk->queue;
	struct blkcg_gq *blkg;
	int i, cpu;

	/* base object first, then the resources hanging off it */
	blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node);
	if (!blkg)
		return NULL;

	if (percpu_ref_init(&blkg->refcnt, blkg_release, 0, gfp_mask))
		goto out_free_blkg;

	blkg->iostat_cpu = alloc_percpu_gfp(struct blkg_iostat_set, gfp_mask);
	if (!blkg->iostat_cpu)
		goto out_exit_refcnt;

	if (!blk_get_queue(q))
		goto out_free_iostat;

	blkg->q = q;
	INIT_LIST_HEAD(&blkg->q_node);
	blkg->blkcg = blkcg;
#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
	spin_lock_init(&blkg->async_bio_lock);
	bio_list_init(&blkg->async_bios);
	INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn);
#endif

	u64_stats_init(&blkg->iostat.sync);
	for_each_possible_cpu(cpu) {
		struct blkg_iostat_set *bis = per_cpu_ptr(blkg->iostat_cpu, cpu);

		u64_stats_init(&bis->sync);
		/* back pointer from the per-cpu stat set to its blkg */
		bis->blkg = blkg;
	}

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];
		struct blkg_policy_data *pd;

		if (!blkcg_policy_enabled(q, pol))
			continue;

		/* one policy-data object per enabled policy, hooked to blkg */
		pd = pol->pd_alloc_fn(disk, blkcg, gfp_mask);
		if (!pd)
			goto out_free_pds;

		blkg->pd[i] = pd;
		pd->blkg = blkg;
		pd->plid = i;
		pd->online = false;
	}

	return blkg;

out_free_pds:
	/* unwind only the slots below the one that failed */
	for (i--; i >= 0; i--) {
		if (blkg->pd[i])
			blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
	}
	blk_put_queue(q);
out_free_iostat:
	free_percpu(blkg->iostat_cpu);
out_exit_refcnt:
	percpu_ref_exit(&blkg->refcnt);
out_free_blkg:
	kfree(blkg);
	return NULL;
}

/*
 * If @new_blkg is %NULL, this function tries to allocate a new one as
 * necessary using %GFP_NOWAIT.  @new_blkg is always consumed on return.
*/
static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk,
				    struct blkcg_gq *new_blkg)
{
	struct blkcg_gq *blkg;
	int i, ret;

	/* the caller must hold the queue lock of @disk */
	lockdep_assert_held(&disk->queue->queue_lock);

	/* request_queue is dying, do not create/recreate a blkg */
	if (blk_queue_dying(disk->queue)) {
		ret = -ENODEV;
		goto err_free_blkg;
	}

	/* blkg holds a reference to blkcg */
	if (!css_tryget_online(&blkcg->css)) {
		ret = -ENODEV;
		goto err_free_blkg;
	}

	/* allocate */
	if (!new_blkg) {
		new_blkg = blkg_alloc(blkcg, disk, GFP_NOWAIT | __GFP_NOWARN);
		if (unlikely(!new_blkg)) {
			ret = -ENOMEM;
			goto err_put_css;
		}
	}
	blkg = new_blkg;

	/* link parent */
	if (blkcg_parent(blkcg)) {
		blkg->parent = blkg_lookup(blkcg_parent(blkcg), disk->queue);
		if (WARN_ON_ONCE(!blkg->parent)) {
			ret = -ENODEV;
			goto err_put_css;
		}
		/* the child keeps its parent pinned */
		blkg_get(blkg->parent);
	}

	/* invoke per-policy init */
	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];

		if (blkg->pd[i] && pol->pd_init_fn)
			pol->pd_init_fn(blkg->pd[i]);
	}

	/* insert */
	spin_lock(&blkcg->lock);
	ret = radix_tree_insert(&blkcg->blkg_tree, disk->queue->id, blkg);
	if (likely(!ret)) {
		/* link into both the blkcg's list and the queue's list */
		hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
		list_add(&blkg->q_node, &disk->queue->blkg_list);

		/* bring each attached policy data online */
		for (i = 0; i < BLKCG_MAX_POLS; i++) {
			struct blkcg_policy *pol = blkcg_policy[i];

			if (blkg->pd[i]) {
				if (pol->pd_online_fn)
					pol->pd_online_fn(blkg->pd[i]);
				blkg->pd[i]->online = true;
			}
		}
	}
	/* note: set whether or not the radix tree insertion succeeded */
	blkg->online = true;
	spin_unlock(&blkcg->lock);

	if (!ret)
		return blkg;

	/*
	 * The insertion failed on a fully initialized @blkg, so drop it
	 * through the usual release path rather than blkg_free().
	 */
	blkg_put(blkg);
	return ERR_PTR(ret);

err_put_css:
	/* undo css_tryget_online() above */
	css_put(&blkcg->css);
err_free_blkg:
	/* @new_blkg is consumed even on failure */
	if (new_blkg)
		blkg_free(new_blkg);
	return ERR_PTR(ret);
}

/**
 * blkg_lookup_create - lookup blkg, try to create one if not there
 * @blkcg: blkcg of interest
 * @disk: gendisk of interest
 *
 * Lookup blkg for the @blkcg - @disk pair.  If it doesn't exist, try to
 * create one.
blkg creation is performed recursively from blkcg_root such
 * that all non-root blkg's have access to the parent blkg.  This function
 * should be called under RCU read lock and takes @disk->queue->queue_lock.
 *
 * Returns the blkg or the closest blkg if blkg_create() fails as it walks
 * down from root.
 */
static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
		struct gendisk *disk)
{
	struct request_queue *q = disk->queue;
	struct blkcg_gq *blkg;
	unsigned long flags;

	WARN_ON_ONCE(!rcu_read_lock_held());

	/* first attempt without taking queue_lock */
	blkg = blkg_lookup(blkcg, q);
	if (blkg)
		return blkg;

	spin_lock_irqsave(&q->queue_lock, flags);
	/* look up again now that queue_lock is held */
	blkg = blkg_lookup(blkcg, q);
	if (blkg) {
		/* refresh the lookup hint, but never for the root blkcg */
		if (blkcg != &blkcg_root &&
		    blkg != rcu_dereference(blkcg->blkg_hint))
			rcu_assign_pointer(blkcg->blkg_hint, blkg);
		goto found;
	}

	/*
	 * Create blkgs walking down from blkcg_root to @blkcg, so that all
	 * non-root blkgs have access to their parents.  Returns the closest
	 * blkg to the intended blkg should blkg_create() fail.
	 */
	while (true) {
		struct blkcg *pos = blkcg;
		struct blkcg *parent = blkcg_parent(blkcg);
		struct blkcg_gq *ret_blkg = q->root_blkg;

		/*
		 * Walk up from @blkcg until an ancestor that already has a
		 * blkg is found; @pos ends up as the blkcg just below it.
		 */
		while (parent) {
			blkg = blkg_lookup(parent, q);
			if (blkg) {
				/* remember closest blkg */
				ret_blkg = blkg;
				break;
			}
			pos = parent;
			parent = blkcg_parent(parent);
		}

		blkg = blkg_create(pos, disk, NULL);
		if (IS_ERR(blkg)) {
			/* fall back to the closest existing blkg */
			blkg = ret_blkg;
			break;
		}
		/* done once the blkg for @blkcg itself has been created */
		if (pos == blkcg)
			break;
	}

found:
	spin_unlock_irqrestore(&q->queue_lock, flags);
	return blkg;
}

/*
 * Take @blkg offline and unlink it from its blkcg.  Both the queue lock and
 * the blkcg lock must be held by the caller.  The blkg stays on the queue's
 * list; the final reference drop is left to the percpu_ref machinery.
 */
static void blkg_destroy(struct blkcg_gq *blkg)
{
	struct blkcg *blkcg = blkg->blkcg;
	int i;

	lockdep_assert_held(&blkg->q->queue_lock);
	lockdep_assert_held(&blkcg->lock);

	/*
	 * blkg stays on the queue list until blkg_free_workfn(), see details in
	 * blkg_free_workfn(), hence this function can be called from
	 * blkcg_destroy_blkgs() first and again from blkg_destroy_all() before
	 * blkg_free_workfn().
	 */
	if (hlist_unhashed(&blkg->blkcg_node))
		return;

	/* take every online policy data offline */
	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];

		if (blkg->pd[i] && blkg->pd[i]->online) {
			blkg->pd[i]->online = false;
			if (pol->pd_offline_fn)
				pol->pd_offline_fn(blkg->pd[i]);
		}
	}

	blkg->online = false;

	/* undo the radix tree and hlist insertion done by blkg_create() */
	radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
	hlist_del_init_rcu(&blkg->blkcg_node);

	/*
	 * Both setting lookup hint to and clearing it from @blkg are done
	 * under queue_lock.  If it's not pointing to @blkg now, it never
	 * will.  Hint assignment itself can race safely.
	 */
	if (rcu_access_pointer(blkcg->blkg_hint) == blkg)
		rcu_assign_pointer(blkcg->blkg_hint, NULL);

	/*
	 * Put the reference taken at the time of creation so that when all
	 * queues are gone, group can be destroyed.
	 */
	percpu_ref_kill(&blkg->refcnt);
}

/*
 * Destroy every blkg on @disk's queue in batches of
 * BLKG_DESTROY_BATCH_SIZE, then clear the queue's policy bits and its
 * root_blkg pointer.
 */
static void blkg_destroy_all(struct gendisk *disk)
{
	struct request_queue *q = disk->queue;
	struct blkcg_gq *blkg;
	int count = BLKG_DESTROY_BATCH_SIZE;
	int i;

restart:
	spin_lock_irq(&q->queue_lock);
	list_for_each_entry(blkg, &q->blkg_list, q_node) {
		struct blkcg *blkcg = blkg->blkcg;

		/* already destroyed, still waiting for blkg_free_workfn() */
		if (hlist_unhashed(&blkg->blkcg_node))
			continue;

		spin_lock(&blkcg->lock);
		blkg_destroy(blkg);
		spin_unlock(&blkcg->lock);

		/*
		 * in order to avoid holding the spin lock for too long, release
		 * it when a batch of blkgs are destroyed.
		 */
		if (!(--count)) {
			count = BLKG_DESTROY_BATCH_SIZE;
			spin_unlock_irq(&q->queue_lock);
			cond_resched();
			/* the walk starts over; destroyed entries are skipped */
			goto restart;
		}
	}

	/*
	 * Mark policy deactivated since policy offline has been done, and
	 * the free is scheduled, so future blkcg_deactivate_policy() can
	 * be bypassed
	 */
	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];

		if (pol)
			__clear_bit(pol->plid, q->blkcg_pols);
	}

	q->root_blkg = NULL;
	spin_unlock_irq(&q->queue_lock);
}

/*
 * Zero the per-cpu and aggregated iostat of every blkg of the blkcg behind
 * @css and call each policy's pd_reset_stats_fn, if it has one.  @cftype and
 * @val are not used.  Always returns 0.
 */
static int blkcg_reset_stats(struct cgroup_subsys_state *css,
			     struct cftype *cftype, u64 val)
{
	struct blkcg *blkcg = css_to_blkcg(css);
	struct blkcg_gq *blkg;
	int i, cpu;

	mutex_lock(&blkcg_pol_mutex);
	spin_lock_irq(&blkcg->lock);

	/*
	 * Note that stat reset is racy - it doesn't synchronize against
	 * stat updates.  This is a debug feature which shouldn't exist
	 * anyway.  If you get hit by a race, retry.
	 */
	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
		for_each_possible_cpu(cpu) {
			struct blkg_iostat_set *bis =
				per_cpu_ptr(blkg->iostat_cpu, cpu);
			memset(bis, 0, sizeof(*bis));

			/* Re-initialize the cleared blkg_iostat_set */
			u64_stats_init(&bis->sync);
			bis->blkg = blkg;
		}
		memset(&blkg->iostat, 0, sizeof(blkg->iostat));
		u64_stats_init(&blkg->iostat.sync);

		for (i = 0; i < BLKCG_MAX_POLS; i++) {
			struct blkcg_policy *pol = blkcg_policy[i];

			if (blkg->pd[i] && pol->pd_reset_stats_fn)
				pol->pd_reset_stats_fn(blkg->pd[i]);
		}
	}

	spin_unlock_irq(&blkcg->lock);
	mutex_unlock(&blkcg_pol_mutex);
	return 0;
}

/*
 * Return the bdi device name of the disk behind @blkg's queue, or NULL when
 * the queue has no disk.
 */
const char *blkg_dev_name(struct blkcg_gq *blkg)
{
	if (!blkg->q->disk)
		return NULL;
	return bdi_dev_name(blkg->q->disk->bdi);
}

/**
 * blkcg_print_blkgs - helper for printing per-blkg data
 * @sf: seq_file to print to
 * @blkcg: blkcg of interest
 * @prfill: fill function to print out a blkg
 * @pol: policy in question
 * @data: data to be passed to @prfill
 * @show_total: to print out sum of prfill return values or not
 *
 * This function invokes @prfill on each blkg of @blkcg if pd for the
 * policy specified by @pol exists.
@prfill is invoked with @sf, the
 * policy data and @data and the matching queue lock held.  If @show_total
 * is %true, the sum of the return values from @prfill is printed with
 * "Total" label at the end.
 *
 * This is to be used to construct print functions for
 * cftype->read_seq_string method.
 */
void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
		       u64 (*prfill)(struct seq_file *,
				     struct blkg_policy_data *, int),
		       const struct blkcg_policy *pol, int data,
		       bool show_total)
{
	struct blkcg_gq *blkg;
	u64 total = 0;

	/* the blkg list is walked under RCU; each blkg is printed under its queue lock */
	rcu_read_lock();
	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
		spin_lock_irq(&blkg->q->queue_lock);
		/* blkgs whose queue does not have @pol enabled are skipped */
		if (blkcg_policy_enabled(blkg->q, pol))
			total += prfill(sf, blkg->pd[pol->plid], data);
		spin_unlock_irq(&blkg->q->queue_lock);
	}
	rcu_read_unlock();

	if (show_total)
		seq_printf(sf, "Total %llu\n", (unsigned long long)total);
}
EXPORT_SYMBOL_GPL(blkcg_print_blkgs);

/**
 * __blkg_prfill_u64 - prfill helper for a single u64 value
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @v: value to print
 *
 * Print @v to @sf for the device associated with @pd.  Returns @v, or 0
 * without printing anything when the device has no name.
 */
u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
{
	const char *dname = blkg_dev_name(pd->blkg);

	if (!dname)
		return 0;

	seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
	return v;
}
EXPORT_SYMBOL_GPL(__blkg_prfill_u64);

/**
 * blkg_conf_init - initialize a blkg_conf_ctx
 * @ctx: blkg_conf_ctx to initialize
 * @input: input string
 *
 * Initialize @ctx which can be used to parse blkg config input string @input.
 * Once initialized, @ctx can be used with blkg_conf_open_bdev() and
 * blkg_conf_prep(), and must be cleaned up with blkg_conf_exit().
*/ void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input) { *ctx = (struct blkg_conf_ctx){ .input = input }; } EXPORT_SYMBOL_GPL(blkg_conf_init); /** * blkg_conf_open_bdev - parse and open bdev for per-blkg config update * @ctx: blkg_conf_ctx initialized with blkg_conf_init() * * Parse the device node prefix part, MAJ:MIN, of per-blkg config update from * @ctx->input and get and store the matching bdev in @ctx->bdev. @ctx->body is * set to point past the device node prefix. * * This function may be called multiple times on @ctx and the extra calls become * NOOPs. blkg_conf_prep() implicitly calls this function. Use this function * explicitly if bdev access is needed without resolving the blkcg / policy part * of @ctx->input. Returns -errno on error. */ int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx) { char *input = ctx->input; unsigned int major, minor; struct block_device *bdev; int key_len; if (ctx->bdev) return 0; if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2) return -EINVAL; input += key_len; if (!isspace(*input)) return -EINVAL; input = skip_spaces(input); bdev = blkdev_get_no_open(MKDEV(major, minor)); if (!bdev) return -ENODEV; if (bdev_is_partition(bdev)) { blkdev_put_no_open(bdev); return -ENODEV; } mutex_lock(&bdev->bd_queue->rq_qos_mutex); if (!disk_live(bdev->bd_disk)) { blkdev_put_no_open(bdev); mutex_unlock(&bdev->bd_queue->rq_qos_mutex); return -ENODEV; } ctx->body = input; ctx->bdev = bdev; return 0; } /** * blkg_conf_prep - parse and prepare for per-blkg config update * @blkcg: target block cgroup * @pol: target policy * @ctx: blkg_conf_ctx initialized with blkg_conf_init() * * Parse per-blkg config update from @ctx->input and initialize @ctx * accordingly. On success, @ctx->body points to the part of @ctx->input * following MAJ:MIN, @ctx->bdev points to the target block device and * @ctx->blkg to the blkg being configured. * * blkg_conf_open_bdev() may be called on @ctx beforehand. 
On success, this
 * function returns with queue lock held and must be followed by
 * blkg_conf_exit().
 */
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
		   struct blkg_conf_ctx *ctx)
	__acquires(&bdev->bd_queue->queue_lock)
{
	struct gendisk *disk;
	struct request_queue *q;
	struct blkcg_gq *blkg;
	int ret;

	/* a no-op if the caller already opened the bdev */
	ret = blkg_conf_open_bdev(ctx);
	if (ret)
		return ret;

	disk = ctx->bdev->bd_disk;
	q = disk->queue;

	/*
	 * blkcg_deactivate_policy() requires queue to be frozen, we can grab
	 * q_usage_counter to prevent concurrent with blkcg_deactivate_policy().
	 */
	ret = blk_queue_enter(q, 0);
	if (ret)
		goto fail;

	spin_lock_irq(&q->queue_lock);

	if (!blkcg_policy_enabled(q, pol)) {
		ret = -EOPNOTSUPP;
		goto fail_unlock;
	}

	/* fast path: the blkg already exists */
	blkg = blkg_lookup(blkcg, q);
	if (blkg)
		goto success;

	/*
	 * Create blkgs walking down from blkcg_root to @blkcg, so that all
	 * non-root blkgs have access to their parents.
	 */
	while (true) {
		struct blkcg *pos = blkcg;
		struct blkcg *parent;
		struct blkcg_gq *new_blkg;

		/* find the topmost ancestor (or @blkcg itself) lacking a blkg */
		parent = blkcg_parent(blkcg);
		while (parent && !blkg_lookup(parent, q)) {
			pos = parent;
			parent = blkcg_parent(parent);
		}

		/* Drop locks to do new blkg allocation with GFP_KERNEL. */
		spin_unlock_irq(&q->queue_lock);

		new_blkg = blkg_alloc(pos, disk, GFP_KERNEL);
		if (unlikely(!new_blkg)) {
			ret = -ENOMEM;
			goto fail_exit_queue;
		}

		if (radix_tree_preload(GFP_KERNEL)) {
			blkg_free(new_blkg);
			ret = -ENOMEM;
			goto fail_exit_queue;
		}

		spin_lock_irq(&q->queue_lock);

		/* the lock was dropped above, so the policy state is rechecked */
		if (!blkcg_policy_enabled(q, pol)) {
			blkg_free(new_blkg);
			ret = -EOPNOTSUPP;
			goto fail_preloaded;
		}

		/* likewise, a blkg for @pos may have appeared meanwhile */
		blkg = blkg_lookup(pos, q);
		if (blkg) {
			blkg_free(new_blkg);
		} else {
			blkg = blkg_create(pos, disk, new_blkg);
			if (IS_ERR(blkg)) {
				ret = PTR_ERR(blkg);
				goto fail_preloaded;
			}
		}

		radix_tree_preload_end();

		if (pos == blkcg)
			goto success;
	}
success:
	/* queue_lock stays held on success; blkg_conf_exit() drops it */
	blk_queue_exit(q);
	ctx->blkg = blkg;
	return 0;

fail_preloaded:
	radix_tree_preload_end();
fail_unlock:
	spin_unlock_irq(&q->queue_lock);
fail_exit_queue:
	blk_queue_exit(q);
fail:
	/*
	 * If queue was bypassing, we should retry.  Do so after a
	 * short msleep().  It isn't strictly necessary but queue
	 * can be bypassing for some time and it's always nice to
	 * avoid busy looping.
	 */
	if (ret == -EBUSY) {
		msleep(10);
		ret = restart_syscall();
	}
	return ret;
}
EXPORT_SYMBOL_GPL(blkg_conf_prep);

/**
 * blkg_conf_exit - clean up per-blkg config update
 * @ctx: blkg_conf_ctx initialized with blkg_conf_init()
 *
 * Clean up after per-blkg config update. This function must be called on all
 * blkg_conf_ctx's initialized with blkg_conf_init().
*/
void blkg_conf_exit(struct blkg_conf_ctx *ctx)
	__releases(&ctx->bdev->bd_queue->queue_lock)
	__releases(&ctx->bdev->bd_queue->rq_qos_mutex)
{
	/* ->blkg is only set when blkg_conf_prep() returned with queue_lock held */
	if (ctx->blkg) {
		spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock);
		ctx->blkg = NULL;
	}

	/* ->bdev is only set when blkg_conf_open_bdev() took rq_qos_mutex */
	if (ctx->bdev) {
		mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex);
		blkdev_put_no_open(ctx->bdev);
		ctx->body = NULL;
		ctx->bdev = NULL;
	}
}
EXPORT_SYMBOL_GPL(blkg_conf_exit);

/* Copy all byte and io counters from @src to @dst. */
static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
{
	int i;

	for (i = 0; i < BLKG_IOSTAT_NR; i++) {
		dst->bytes[i] = src->bytes[i];
		dst->ios[i] = src->ios[i];
	}
}

/* Add all byte and io counters of @src to @dst. */
static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src)
{
	int i;

	for (i = 0; i < BLKG_IOSTAT_NR; i++) {
		dst->bytes[i] += src->bytes[i];
		dst->ios[i] += src->ios[i];
	}
}

/* Subtract all byte and io counters of @src from @dst. */
static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src)
{
	int i;

	for (i = 0; i < BLKG_IOSTAT_NR; i++) {
		dst->bytes[i] -= src->bytes[i];
		dst->ios[i] -= src->ios[i];
	}
}

/*
 * Fold the change between @last and @cur into @blkg's aggregated counters.
 *
 * The delta (@cur - @last) is added to @blkg->iostat.cur and to @last, so
 * afterwards @last equals @cur.  Runs inside an irq-saving u64_stats update
 * section on @blkg->iostat.sync.
 */
static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur,
				struct blkg_iostat *last)
{
	struct blkg_iostat delta;
	unsigned long flags;

	/* propagate percpu delta to global */
	flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
	blkg_iostat_set(&delta, cur);
	blkg_iostat_sub(&delta, last);
	blkg_iostat_add(&blkg->iostat.cur, &delta);
	blkg_iostat_add(last, &delta);
	u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
}

/*
 * Flush the iostat sets queued on @blkcg's lockless list for @cpu.
 *
 * The whole list is detached at once; each queued per-cpu set is folded into
 * its blkg and the blkg's own delta is then pushed to the parent blkg unless
 * that parent is the root.
 */
static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu)
{
	struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
	struct llist_node *lnode;
	struct blkg_iostat_set *bisc, *next_bisc;
	unsigned long flags;

	rcu_read_lock();

	lnode = llist_del_all(lhead);
	if (!lnode)
		goto out;

	/*
	 * For covering concurrent parent blkg update from blkg_release().
	 *
	 * When flushing from cgroup, cgroup_rstat_lock is always held, so
	 * this lock won't cause contention most of time.
	 */
	raw_spin_lock_irqsave(&blkg_stat_lock, flags);

	/*
	 * Iterate only the iostat_cpu's queued in the lockless list.
	 */
	llist_for_each_entry_safe(bisc, next_bisc, lnode, lnode) {
		struct blkcg_gq *blkg = bisc->blkg;
		struct blkcg_gq *parent = blkg->parent;
		struct blkg_iostat cur;
		unsigned int seq;

		/* the set has been taken off the list, clear its queued marker */
		WRITE_ONCE(bisc->lqueued, false);

		/* fetch the current per-cpu values */
		do {
			seq = u64_stats_fetch_begin(&bisc->sync);
			blkg_iostat_set(&cur, &bisc->cur);
		} while (u64_stats_fetch_retry(&bisc->sync, seq));

		blkcg_iostat_update(blkg, &cur, &bisc->last);

		/* propagate global delta to parent (unless that's root) */
		if (parent && parent->parent)
			blkcg_iostat_update(parent, &blkg->iostat.cur,
					    &blkg->iostat.last);
	}
	raw_spin_unlock_irqrestore(&blkg_stat_lock, flags);
out:
	rcu_read_unlock();
}

/* css_rstat_flush callback (see io_cgrp_subsys); does nothing for the root cgroup. */
static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
	/* Root-level stats are sourced from system-wide IO stats */
	if (cgroup_parent(css->cgroup))
		__blkcg_rstat_flush(css_to_blkcg(css), cpu);
}

/*
 * We source root cgroup stats from the system-wide stats to avoid
 * tracking the same information twice and incurring overhead when no
 * cgroups are defined. For that reason, cgroup_rstat_flush in
 * blkcg_print_stat does not actually fill out the iostat in the root
 * cgroup's blkcg_gq.
 *
 * However, we would like to re-use the printing code between the root and
 * non-root cgroups to the extent possible. For that reason, we simulate
 * flushing the root cgroup's stats by explicitly filling in the iostat
 * with disk level statistics.
*/ static void blkcg_fill_root_iostats(void) { struct class_dev_iter iter; struct device *dev; class_dev_iter_init(&iter, &block_class, NULL, &disk_type); while ((dev = class_dev_iter_next(&iter))) { struct block_device *bdev = dev_to_bdev(dev); struct blkcg_gq *blkg = bdev->bd_disk->queue->root_blkg; struct blkg_iostat tmp; int cpu; unsigned long flags; memset(&tmp, 0, sizeof(tmp)); for_each_possible_cpu(cpu) { struct disk_stats *cpu_dkstats; cpu_dkstats = per_cpu_ptr(bdev->bd_stats, cpu); tmp.ios[BLKG_IOSTAT_READ] += cpu_dkstats->ios[STAT_READ]; tmp.ios[BLKG_IOSTAT_WRITE] += cpu_dkstats->ios[STAT_WRITE]; tmp.ios[BLKG_IOSTAT_DISCARD] += cpu_dkstats->ios[STAT_DISCARD]; // convert sectors to bytes tmp.bytes[BLKG_IOSTAT_READ] += cpu_dkstats->sectors[STAT_READ] << 9; tmp.bytes[BLKG_IOSTAT_WRITE] += cpu_dkstats->sectors[STAT_WRITE] << 9; tmp.bytes[BLKG_IOSTAT_DISCARD] += cpu_dkstats->sectors[STAT_DISCARD] << 9; } flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync); blkg_iostat_set(&blkg->iostat.cur, &tmp); u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags); } } static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s) { struct blkg_iostat_set *bis = &blkg->iostat; u64 rbytes, wbytes, rios, wios, dbytes, dios; const char *dname; unsigned seq; int i; if (!blkg->online) return; dname = blkg_dev_name(blkg); if (!dname) return; seq_printf(s, "%s ", dname); do { seq = u64_stats_fetch_begin(&bis->sync); rbytes = bis->cur.bytes[BLKG_IOSTAT_READ]; wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE]; dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD]; rios = bis->cur.ios[BLKG_IOSTAT_READ]; wios = bis->cur.ios[BLKG_IOSTAT_WRITE]; dios = bis->cur.ios[BLKG_IOSTAT_DISCARD]; } while (u64_stats_fetch_retry(&bis->sync, seq)); if (rbytes || wbytes || rios || wios) { seq_printf(s, "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu", rbytes, wbytes, rios, wios, dbytes, dios); } if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) { seq_printf(s, " 
use_delay=%d delay_nsec=%llu", atomic_read(&blkg->use_delay), atomic64_read(&blkg->delay_nsec)); } for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; if (!blkg->pd[i] || !pol->pd_stat_fn) continue; pol->pd_stat_fn(blkg->pd[i], s); } seq_puts(s, "\n"); } static int blkcg_print_stat(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); struct blkcg_gq *blkg; if (!seq_css(sf)->parent) blkcg_fill_root_iostats(); else cgroup_rstat_flush(blkcg->css.cgroup); rcu_read_lock(); hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { spin_lock_irq(&blkg->q->queue_lock); blkcg_print_one_stat(blkg, sf); spin_unlock_irq(&blkg->q->queue_lock); } rcu_read_unlock(); return 0; } static struct cftype blkcg_files[] = { { .name = "stat", .seq_show = blkcg_print_stat, }, { } /* terminate */ }; static struct cftype blkcg_legacy_files[] = { { .name = "reset_stats", .write_u64 = blkcg_reset_stats, }, { } /* terminate */ }; #ifdef CONFIG_CGROUP_WRITEBACK struct list_head *blkcg_get_cgwb_list(struct cgroup_subsys_state *css) { return &css_to_blkcg(css)->cgwb_list; } #endif /* * blkcg destruction is a three-stage process. * * 1. Destruction starts. The blkcg_css_offline() callback is invoked * which offlines writeback. Here we tie the next stage of blkg destruction * to the completion of writeback associated with the blkcg. This lets us * avoid punting potentially large amounts of outstanding writeback to root * while maintaining any ongoing policies. The next stage is triggered when * the nr_cgwbs count goes to zero. * * 2. When the nr_cgwbs count goes to zero, blkcg_destroy_blkgs() is called * and handles the destruction of blkgs. Here the css reference held by * the blkg is put back eventually allowing blkcg_css_free() to be called. * This work may occur in cgwb_release_workfn() on the cgwb_release * workqueue. Any submitted ios that fail to get the blkg ref will be * punted to the root_blkg. * * 3. 
Once the blkcg ref count goes to zero, blkcg_css_free() is called.
 *    This finally frees the blkcg.
 */

/**
 * blkcg_destroy_blkgs - responsible for shooting down blkgs
 * @blkcg: blkcg of interest
 *
 * blkgs should be removed while holding both q and blkcg locks.  As blkcg lock
 * is nested inside q lock, this function performs reverse double lock dancing.
 * Destroying the blkgs releases the reference held on the blkcg's css allowing
 * blkcg_css_free to eventually be called.
 *
 * This is the blkcg counterpart of ioc_release_fn().
 */
static void blkcg_destroy_blkgs(struct blkcg *blkcg)
{
	might_sleep();

	spin_lock_irq(&blkcg->lock);

	/* always work on the list head; blkg_destroy() unhashes the entry */
	while (!hlist_empty(&blkcg->blkg_list)) {
		struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
						struct blkcg_gq, blkcg_node);
		struct request_queue *q = blkg->q;

		if (need_resched() || !spin_trylock(&q->queue_lock)) {
			/*
			 * Given that the system can accumulate a huge number
			 * of blkgs in pathological cases, check to see if we
			 * need to reschedule to avoid softlockup.
			 */
			spin_unlock_irq(&blkcg->lock);
			cond_resched();
			spin_lock_irq(&blkcg->lock);
			continue;
		}

		blkg_destroy(blkg);
		spin_unlock(&q->queue_lock);
	}

	spin_unlock_irq(&blkcg->lock);
}

/**
 * blkcg_pin_online - pin online state
 * @blkcg_css: blkcg of interest
 *
 * While pinned, a blkcg is kept online.  This is primarily used to
 * impedance-match blkg and cgwb lifetimes so that blkg doesn't go offline
 * while an associated cgwb is still active.
 */
void blkcg_pin_online(struct cgroup_subsys_state *blkcg_css)
{
	refcount_inc(&css_to_blkcg(blkcg_css)->online_pin);
}

/**
 * blkcg_unpin_online - unpin online state
 * @blkcg_css: blkcg of interest
 *
 * This is primarily used to impedance-match blkg and cgwb lifetimes so
 * that blkg doesn't go offline while an associated cgwb is still active.
 * When this count goes to zero, all active cgwbs have finished so the
 * blkcg can continue destruction by calling blkcg_destroy_blkgs().
*/
void blkcg_unpin_online(struct cgroup_subsys_state *blkcg_css)
{
	struct blkcg *blkcg = css_to_blkcg(blkcg_css);

	/*
	 * Each child pins its parent in blkcg_css_online(), so dropping the
	 * last pin of a blkcg also drops one pin of its parent; keep walking
	 * up for as long as the pin count keeps hitting zero.
	 */
	do {
		if (!refcount_dec_and_test(&blkcg->online_pin))
			break;
		blkcg_destroy_blkgs(blkcg);
		blkcg = blkcg_parent(blkcg);
	} while (blkcg);
}

/**
 * blkcg_css_offline - cgroup css_offline callback
 * @css: css of interest
 *
 * This function is called when @css is about to go away.  Here the cgwbs are
 * offlined first and only once writeback associated with the blkcg has
 * finished do we start step 2 (see above).
 */
static void blkcg_css_offline(struct cgroup_subsys_state *css)
{
	/* this prevents anyone from attaching or migrating to this blkcg */
	wb_blkcg_offline(css);

	/* put the base online pin allowing step 2 to be triggered */
	blkcg_unpin_online(css);
}

/*
 * css_free callback: unlink the blkcg from all_blkcgs, free the per-policy
 * cgroup data, the per-cpu llist heads and finally the blkcg itself.
 */
static void blkcg_css_free(struct cgroup_subsys_state *css)
{
	struct blkcg *blkcg = css_to_blkcg(css);
	int i;

	mutex_lock(&blkcg_pol_mutex);

	list_del(&blkcg->all_blkcgs_node);

	for (i = 0; i < BLKCG_MAX_POLS; i++)
		if (blkcg->cpd[i])
			blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);

	mutex_unlock(&blkcg_pol_mutex);

	free_percpu(blkcg->lhead);
	kfree(blkcg);
}

/*
 * css_alloc callback: set up a blkcg.
 *
 * The root cgroup (no @parent_css) uses the static blkcg_root, every other
 * cgroup gets a zeroed allocation.  Per-policy cgroup data is allocated for
 * each registered policy that provides cpd_alloc_fn.  Returns the embedded
 * css, or ERR_PTR(-ENOMEM) on any failure.
 */
static struct cgroup_subsys_state *
blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct blkcg *blkcg;
	int i;

	mutex_lock(&blkcg_pol_mutex);

	if (!parent_css) {
		blkcg = &blkcg_root;
	} else {
		blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
		if (!blkcg)
			goto unlock;
	}

	if (init_blkcg_llists(blkcg))
		goto free_blkcg;

	for (i = 0; i < BLKCG_MAX_POLS ; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];
		struct blkcg_policy_data *cpd;

		/*
		 * If the policy hasn't been attached yet, wait for it
		 * to be attached before doing anything else. Otherwise,
		 * check if the policy requires any specific per-cgroup
		 * data: if it does, allocate and initialize it.
		 */
		if (!pol || !pol->cpd_alloc_fn)
			continue;

		cpd = pol->cpd_alloc_fn(GFP_KERNEL);
		if (!cpd)
			goto free_pd_blkcg;

		blkcg->cpd[i] = cpd;
		cpd->blkcg = blkcg;
		cpd->plid = i;
	}

	spin_lock_init(&blkcg->lock);
	/* base pin, dropped by blkcg_css_offline() */
	refcount_set(&blkcg->online_pin, 1);
	INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN);
	INIT_HLIST_HEAD(&blkcg->blkg_list);
#ifdef CONFIG_CGROUP_WRITEBACK
	INIT_LIST_HEAD(&blkcg->cgwb_list);
#endif
	list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs);

	mutex_unlock(&blkcg_pol_mutex);
	return &blkcg->css;

free_pd_blkcg:
	/* slot i failed; undo the slots filled before it */
	for (i--; i >= 0; i--)
		if (blkcg->cpd[i])
			blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
	free_percpu(blkcg->lhead);
free_blkcg:
	/* blkcg_root is static and must not be freed */
	if (blkcg != &blkcg_root)
		kfree(blkcg);
unlock:
	mutex_unlock(&blkcg_pol_mutex);
	return ERR_PTR(-ENOMEM);
}

/* css_online callback: pin the parent blkcg online.  Always returns 0. */
static int blkcg_css_online(struct cgroup_subsys_state *css)
{
	struct blkcg *parent = blkcg_parent(css_to_blkcg(css));

	/*
	 * blkcg_pin_online() is used to delay blkcg offline so that blkgs
	 * don't go offline while cgwbs are still active on them.  Pin the
	 * parent so that offline always happens towards the root.
	 */
	if (parent)
		blkcg_pin_online(&parent->css);
	return 0;
}

/*
 * Set up blkcg state for @disk: initialize the queue's blkg list and mutex,
 * create the root blkg, then initialize ioprio and throttling.  Returns 0 on
 * success or -errno, undoing the earlier steps on failure.
 */
int blkcg_init_disk(struct gendisk *disk)
{
	struct request_queue *q = disk->queue;
	struct blkcg_gq *new_blkg, *blkg;
	bool preloaded;
	int ret;

	INIT_LIST_HEAD(&q->blkg_list);
	mutex_init(&q->blkcg_mutex);

	new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL);
	if (!new_blkg)
		return -ENOMEM;

	/* a failed preload is tolerated; remember whether to end it later */
	preloaded = !radix_tree_preload(GFP_KERNEL);

	/* Make sure the root blkg exists. */
	/* spin_lock_irq can serve as RCU read-side critical section. */
	spin_lock_irq(&q->queue_lock);
	blkg = blkg_create(&blkcg_root, disk, new_blkg);
	if (IS_ERR(blkg))
		goto err_unlock;
	q->root_blkg = blkg;
	spin_unlock_irq(&q->queue_lock);

	if (preloaded)
		radix_tree_preload_end();

	ret = blk_ioprio_init(disk);
	if (ret)
		goto err_destroy_all;

	ret = blk_throtl_init(disk);
	if (ret)
		goto err_ioprio_exit;

	return 0;

err_ioprio_exit:
	blk_ioprio_exit(disk);
err_destroy_all:
	blkg_destroy_all(disk);
	return ret;
err_unlock:
	spin_unlock_irq(&q->queue_lock);
	if (preloaded)
		radix_tree_preload_end();
	return PTR_ERR(blkg);
}

/* Tear down blkcg state for @disk: destroy all blkgs, then exit throttling. */
void blkcg_exit_disk(struct gendisk *disk)
{
	blkg_destroy_all(disk);
	blk_throtl_exit(disk);
}

/* Subsystem exit callback: drop the disk reference @tsk may still hold for throttling. */
static void blkcg_exit(struct task_struct *tsk)
{
	if (tsk->throttle_disk)
		put_disk(tsk->throttle_disk);
	tsk->throttle_disk = NULL;
}

struct cgroup_subsys io_cgrp_subsys = {
	.css_alloc = blkcg_css_alloc,
	.css_online = blkcg_css_online,
	.css_offline = blkcg_css_offline,
	.css_free = blkcg_css_free,
	.css_rstat_flush = blkcg_rstat_flush,
	.dfl_cftypes = blkcg_files,
	.legacy_cftypes = blkcg_legacy_files,
	.legacy_name = "blkio",
	.exit = blkcg_exit,
#ifdef CONFIG_MEMCG
	/*
	 * This ensures that, if available, memcg is automatically enabled
	 * together on the default hierarchy so that the owner cgroup can
	 * be retrieved from writeback pages.
	 */
	.depends_on = 1 << memory_cgrp_id,
#endif
};
EXPORT_SYMBOL_GPL(io_cgrp_subsys);

/**
 * blkcg_activate_policy - activate a blkcg policy on a gendisk
 * @disk: gendisk of interest
 * @pol: blkcg policy to activate
 *
 * Activate @pol on @disk.  Requires %GFP_KERNEL context.  @disk goes through
 * bypass mode to populate its blkgs with policy_data for @pol.
 *
 * Activation happens with @disk bypassed, so nobody would be accessing blkgs
 * from IO path.  Update of each blkg is protected by both queue and blkcg
 * locks so that holding either lock and testing blkcg_policy_enabled() is
 * always enough for dereferencing policy data.
 *
 * The caller is responsible for synchronizing [de]activations and policy
 * [un]registerations.
Returns 0 on success, -errno on failure.
 */
int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
{
	struct request_queue *q = disk->queue;
	struct blkg_policy_data *pd_prealloc = NULL;
	struct blkcg_gq *blkg, *pinned_blkg = NULL;
	int ret;

	if (blkcg_policy_enabled(q, pol))
		return 0;

	if (queue_is_mq(q))
		blk_mq_freeze_queue(q);
retry:
	spin_lock_irq(&q->queue_lock);

	/* blkg_list is pushed at the head, reverse walk to initialize parents first */
	list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
		struct blkg_policy_data *pd;

		/* already populated on an earlier pass */
		if (blkg->pd[pol->plid])
			continue;

		/* If prealloc matches, use it; otherwise try GFP_NOWAIT */
		if (blkg == pinned_blkg) {
			pd = pd_prealloc;
			pd_prealloc = NULL;
		} else {
			pd = pol->pd_alloc_fn(disk, blkg->blkcg,
					      GFP_NOWAIT | __GFP_NOWARN);
		}

		if (!pd) {
			/*
			 * GFP_NOWAIT failed.  Free the existing one and
			 * prealloc for @blkg w/ GFP_KERNEL.
			 */
			if (pinned_blkg)
				blkg_put(pinned_blkg);
			/* hold @blkg so it can be used after the lock is dropped */
			blkg_get(blkg);
			pinned_blkg = blkg;

			spin_unlock_irq(&q->queue_lock);

			if (pd_prealloc)
				pol->pd_free_fn(pd_prealloc);
			pd_prealloc = pol->pd_alloc_fn(disk, blkg->blkcg,
						       GFP_KERNEL);
			if (pd_prealloc)
				goto retry;
			else
				goto enomem;
		}

		spin_lock(&blkg->blkcg->lock);

		pd->blkg = blkg;
		pd->plid = pol->plid;
		blkg->pd[pol->plid] = pd;

		if (pol->pd_init_fn)
			pol->pd_init_fn(pd);

		if (pol->pd_online_fn)
			pol->pd_online_fn(pd);
		pd->online = true;

		spin_unlock(&blkg->blkcg->lock);
	}

	/* every blkg now has policy data; mark the policy enabled */
	__set_bit(pol->plid, q->blkcg_pols);
	ret = 0;

	spin_unlock_irq(&q->queue_lock);
out:
	if (queue_is_mq(q))
		blk_mq_unfreeze_queue(q);
	if (pinned_blkg)
		blkg_put(pinned_blkg);
	if (pd_prealloc)
		pol->pd_free_fn(pd_prealloc);
	return ret;

enomem:
	/* alloc failed, take down everything */
	spin_lock_irq(&q->queue_lock);
	list_for_each_entry(blkg, &q->blkg_list, q_node) {
		struct blkcg *blkcg = blkg->blkcg;
		struct blkg_policy_data *pd;

		spin_lock(&blkcg->lock);
		pd = blkg->pd[pol->plid];
		if (pd) {
			if (pd->online && pol->pd_offline_fn)
				pol->pd_offline_fn(pd);
			pd->online = false;
			pol->pd_free_fn(pd);
			blkg->pd[pol->plid] = NULL;
		}
		spin_unlock(&blkcg->lock);
	}
	spin_unlock_irq(&q->queue_lock);
	ret = -ENOMEM;
	goto out;
}
EXPORT_SYMBOL_GPL(blkcg_activate_policy);

/**
 * blkcg_deactivate_policy - deactivate a blkcg policy on a gendisk
 * @disk: gendisk of interest
 * @pol: blkcg policy to deactivate
 *
 * Deactivate @pol on @disk.  Follows the same synchronization rules as
 * blkcg_activate_policy().
 */
void blkcg_deactivate_policy(struct gendisk *disk,
			     const struct blkcg_policy *pol)
{
	struct request_queue *q = disk->queue;
	struct blkcg_gq *blkg;

	if (!blkcg_policy_enabled(q, pol))
		return;

	if (queue_is_mq(q))
		blk_mq_freeze_queue(q);

	mutex_lock(&q->blkcg_mutex);
	spin_lock_irq(&q->queue_lock);

	/* clear the enabled bit before the policy data is torn down */
	__clear_bit(pol->plid, q->blkcg_pols);

	list_for_each_entry(blkg, &q->blkg_list, q_node) {
		struct blkcg *blkcg = blkg->blkcg;

		spin_lock(&blkcg->lock);
		if (blkg->pd[pol->plid]) {
			if (blkg->pd[pol->plid]->online && pol->pd_offline_fn)
				pol->pd_offline_fn(blkg->pd[pol->plid]);
			pol->pd_free_fn(blkg->pd[pol->plid]);
			blkg->pd[pol->plid] = NULL;
		}
		spin_unlock(&blkcg->lock);
	}

	spin_unlock_irq(&q->queue_lock);
	mutex_unlock(&q->blkcg_mutex);

	if (queue_is_mq(q))
		blk_mq_unfreeze_queue(q);
}
EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);

/*
 * Free @pol's per-cgroup data on every blkcg in all_blkcgs and clear the
 * slots.  Calls pol->cpd_free_fn unconditionally, so callers check that it
 * is set.
 */
static void blkcg_free_all_cpd(struct blkcg_policy *pol)
{
	struct blkcg *blkcg;

	list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
		if (blkcg->cpd[pol->plid]) {
			pol->cpd_free_fn(blkcg->cpd[pol->plid]);
			blkcg->cpd[pol->plid] = NULL;
		}
	}
}

/**
 * blkcg_policy_register - register a blkcg policy
 * @pol: blkcg policy to register
 *
 * Register @pol with blkcg core.  Might sleep and @pol may be modified on
 * successful registration.  Returns 0 on success and -errno on failure.
*/ int blkcg_policy_register(struct blkcg_policy *pol) { struct blkcg *blkcg; int i, ret; mutex_lock(&blkcg_pol_register_mutex); mutex_lock(&blkcg_pol_mutex); /* find an empty slot */ ret = -ENOSPC; for (i = 0; i < BLKCG_MAX_POLS; i++) if (!blkcg_policy[i]) break; if (i >= BLKCG_MAX_POLS) { pr_warn("blkcg_policy_register: BLKCG_MAX_POLS too small\n"); goto err_unlock; } /* Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs */ if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) || (!pol->pd_alloc_fn ^ !pol->pd_free_fn)) goto err_unlock; /* register @pol */ pol->plid = i; blkcg_policy[pol->plid] = pol; /* allocate and install cpd's */ if (pol->cpd_alloc_fn) { list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { struct blkcg_policy_data *cpd; cpd = pol->cpd_alloc_fn(GFP_KERNEL); if (!cpd) goto err_free_cpds; blkcg->cpd[pol->plid] = cpd; cpd->blkcg = blkcg; cpd->plid = pol->plid; } } mutex_unlock(&blkcg_pol_mutex); /* everything is in place, add intf files for the new policy */ if (pol->dfl_cftypes) WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys, pol->dfl_cftypes)); if (pol->legacy_cftypes) WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys, pol->legacy_cftypes)); mutex_unlock(&blkcg_pol_register_mutex); return 0; err_free_cpds: if (pol->cpd_free_fn) blkcg_free_all_cpd(pol); blkcg_policy[pol->plid] = NULL; err_unlock: mutex_unlock(&blkcg_pol_mutex); mutex_unlock(&blkcg_pol_register_mutex); return ret; } EXPORT_SYMBOL_GPL(blkcg_policy_register); /** * blkcg_policy_unregister - unregister a blkcg policy * @pol: blkcg policy to unregister * * Undo blkcg_policy_register(@pol). Might sleep. 
*/ void blkcg_policy_unregister(struct blkcg_policy *pol) { mutex_lock(&blkcg_pol_register_mutex); if (WARN_ON(blkcg_policy[pol->plid] != pol)) goto out_unlock; /* kill the intf files first */ if (pol->dfl_cftypes) cgroup_rm_cftypes(pol->dfl_cftypes); if (pol->legacy_cftypes) cgroup_rm_cftypes(pol->legacy_cftypes); /* remove cpds and unregister */ mutex_lock(&blkcg_pol_mutex); if (pol->cpd_free_fn) blkcg_free_all_cpd(pol); blkcg_policy[pol->plid] = NULL; mutex_unlock(&blkcg_pol_mutex); out_unlock: mutex_unlock(&blkcg_pol_register_mutex); } EXPORT_SYMBOL_GPL(blkcg_policy_unregister); /* * Scale the accumulated delay based on how long it has been since we updated * the delay. We only call this when we are adding delay, in case it's been a * while since we added delay, and when we are checking to see if we need to * delay a task, to account for any delays that may have occurred. */ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now) { u64 old = atomic64_read(&blkg->delay_start); /* negative use_delay means no scaling, see blkcg_set_delay() */ if (atomic_read(&blkg->use_delay) < 0) return; /* * We only want to scale down every second. The idea here is that we * want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain * time window. We only want to throttle tasks for recent delay that * has occurred, in 1 second time windows since that's the maximum * things can be throttled. We save the current delay window in * blkg->last_delay so we know what amount is still left to be charged * to the blkg from this point onward. blkg->last_use keeps track of * the use_delay counter. The idea is if we're unthrottling the blkg we * are ok with whatever is happening now, and we can take away more of * the accumulated delay as we've already throttled enough that * everybody is happy with their IO latencies. 
*/ if (time_before64(old + NSEC_PER_SEC, now) && atomic64_try_cmpxchg(&blkg->delay_start, &old, now)) { u64 cur = atomic64_read(&blkg->delay_nsec); u64 sub = min_t(u64, blkg->last_delay, now - old); int cur_use = atomic_read(&blkg->use_delay); /* * We've been unthrottled, subtract a larger chunk of our * accumulated delay. */ if (cur_use < blkg->last_use) sub = max_t(u64, sub, blkg->last_delay >> 1); /* * This shouldn't happen, but handle it anyway. Our delay_nsec * should only ever be growing except here where we subtract out * min(last_delay, 1 second), but lord knows bugs happen and I'd * rather not end up with negative numbers. */ if (unlikely(cur < sub)) { atomic64_set(&blkg->delay_nsec, 0); blkg->last_delay = 0; } else { atomic64_sub(sub, &blkg->delay_nsec); blkg->last_delay = cur - sub; } blkg->last_use = cur_use; } } /* * This is called when we want to actually walk up the hierarchy and check to * see if we need to throttle, and then actually throttle if there is some * accumulated delay. This should only be called upon return to user space so * we're not holding some lock that would induce a priority inversion. */ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) { unsigned long pflags; bool clamp; u64 now = blk_time_get_ns(); u64 exp; u64 delay_nsec = 0; int tok; while (blkg->parent) { int use_delay = atomic_read(&blkg->use_delay); if (use_delay) { u64 this_delay; blkcg_scale_delay(blkg, now); this_delay = atomic64_read(&blkg->delay_nsec); if (this_delay > delay_nsec) { delay_nsec = this_delay; clamp = use_delay > 0; } } blkg = blkg->parent; } if (!delay_nsec) return; /* * Let's not sleep for all eternity if we've amassed a huge delay. * Swapping or metadata IO can accumulate 10's of seconds worth of * delay, and we want userspace to be able to do _something_ so cap the * delays at 0.25s. If there's 10's of seconds worth of delay then the * tasks will be delayed for 0.25 second for every syscall. 
If * blkcg_set_delay() was used as indicated by negative use_delay, the * caller is responsible for regulating the range. */ if (clamp) delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC); if (use_memdelay) psi_memstall_enter(&pflags); exp = ktime_add_ns(now, delay_nsec); tok = io_schedule_prepare(); do { __set_current_state(TASK_KILLABLE); if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS)) break; } while (!fatal_signal_pending(current)); io_schedule_finish(tok); if (use_memdelay) psi_memstall_leave(&pflags); } /** * blkcg_maybe_throttle_current - throttle the current task if it has been marked * * This is only called if we've been marked with set_notify_resume(). Obviously * we can be set_notify_resume() for reasons other than blkcg throttling, so we * check to see if current->throttle_disk is set and if not this doesn't do * anything. This should only ever be called by the resume code, it's not meant * to be called by people willy-nilly as it will actually do the work to * throttle the task if it is setup for throttling. */ void blkcg_maybe_throttle_current(void) { struct gendisk *disk = current->throttle_disk; struct blkcg *blkcg; struct blkcg_gq *blkg; bool use_memdelay = current->use_memdelay; if (!disk) return; current->throttle_disk = NULL; current->use_memdelay = false; rcu_read_lock(); blkcg = css_to_blkcg(blkcg_css()); if (!blkcg) goto out; blkg = blkg_lookup(blkcg, disk->queue); if (!blkg) goto out; if (!blkg_tryget(blkg)) goto out; rcu_read_unlock(); blkcg_maybe_throttle_blkg(blkg, use_memdelay); blkg_put(blkg); put_disk(disk); return; out: rcu_read_unlock(); } /** * blkcg_schedule_throttle - this task needs to check for throttling * @disk: disk to throttle * @use_memdelay: do we charge this to memory delay for PSI * * This is called by the IO controller when we know there's delay accumulated * for the blkg for this task. 
We do not pass the blkg because there are places
 * we call this that may not have that information, the swapping code for
 * instance will only have a block_device at that point.  This set's the
 * notify_resume for the task to check and see if it requires throttling before
 * returning to user space.
 *
 * We will only schedule once per syscall.  You can call this over and over
 * again and it will only do the check once upon return to user space, and only
 * throttle once.  If the task needs to be throttled again it'll need to be
 * re-set at the next time we see the task.
 */
void blkcg_schedule_throttle(struct gendisk *disk, bool use_memdelay)
{
	/* kernel threads are never marked for throttling */
	if (unlikely(current->flags & PF_KTHREAD))
		return;

	if (current->throttle_disk != disk) {
		/* do not take a reference on a disk already marked dead */
		if (test_bit(GD_DEAD, &disk->state))
			return;
		get_device(disk_to_dev(disk));

		/* swap the previously stashed disk, if any, for the new one */
		if (current->throttle_disk)
			put_disk(current->throttle_disk);
		current->throttle_disk = disk;
	}

	/* only ever set here; the flag is cleared when the throttle runs */
	if (use_memdelay)
		current->use_memdelay = use_memdelay;
	set_notify_resume(current);
}

/**
 * blkcg_add_delay - add delay to this blkg
 * @blkg: blkg of interest
 * @now: the current time in nanoseconds
 * @delta: how many nanoseconds of delay to add
 *
 * Charge @delta to the blkg's current delay accumulation.  This is used to
 * throttle tasks if an IO controller thinks we need more throttling.
 * Warns once and does nothing when @blkg's use_delay is negative.
 */
void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
{
	if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0))
		return;
	/* age the existing delay before charging the new one */
	blkcg_scale_delay(blkg, now);
	atomic64_add(delta, &blkg->delay_nsec);
}

/**
 * blkg_tryget_closest - try and get a blkg ref on the closet blkg
 * @bio: target bio
 * @css: target css
 *
 * As the failure mode here is to walk up the blkg tree, this ensure that the
 * blkg->parent pointers are always valid.  This returns the blkg that it ended
 * up taking a reference on or %NULL if no reference was taken.
*/ static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio, struct cgroup_subsys_state *css) { struct blkcg_gq *blkg, *ret_blkg = NULL; rcu_read_lock(); blkg = blkg_lookup_create(css_to_blkcg(css), bio->bi_bdev->bd_disk); while (blkg) { if (blkg_tryget(blkg)) { ret_blkg = blkg; break; } blkg = blkg->parent; } rcu_read_unlock(); return ret_blkg; } /** * bio_associate_blkg_from_css - associate a bio with a specified css * @bio: target bio * @css: target css * * Associate @bio with the blkg found by combining the css's blkg and the * request_queue of the @bio. An association failure is handled by walking up * the blkg tree. Therefore, the blkg associated can be anything between @blkg * and q->root_blkg. This situation only happens when a cgroup is dying and * then the remaining bios will spill to the closest alive blkg. * * A reference will be taken on the blkg and will be released when @bio is * freed. */ void bio_associate_blkg_from_css(struct bio *bio, struct cgroup_subsys_state *css) { if (bio->bi_blkg) blkg_put(bio->bi_blkg); if (css && css->parent) { bio->bi_blkg = blkg_tryget_closest(bio, css); } else { blkg_get(bdev_get_queue(bio->bi_bdev)->root_blkg); bio->bi_blkg = bdev_get_queue(bio->bi_bdev)->root_blkg; } } EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); /** * bio_associate_blkg - associate a bio with a blkg * @bio: target bio * * Associate @bio with the blkg found from the bio's css and request_queue. * If one is not found, bio_lookup_blkg() creates the blkg. If a blkg is * already associated, the css is reused and association redone as the * request_queue may have changed. 
*/ void bio_associate_blkg(struct bio *bio) { struct cgroup_subsys_state *css; if (blk_op_is_passthrough(bio->bi_opf)) return; rcu_read_lock(); if (bio->bi_blkg) css = bio_blkcg_css(bio); else css = blkcg_css(); bio_associate_blkg_from_css(bio, css); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(bio_associate_blkg); /** * bio_clone_blkg_association - clone blkg association from src to dst bio * @dst: destination bio * @src: source bio */ void bio_clone_blkg_association(struct bio *dst, struct bio *src) { if (src->bi_blkg) bio_associate_blkg_from_css(dst, bio_blkcg_css(src)); } EXPORT_SYMBOL_GPL(bio_clone_blkg_association); static int blk_cgroup_io_type(struct bio *bio) { if (op_is_discard(bio->bi_opf)) return BLKG_IOSTAT_DISCARD; if (op_is_write(bio->bi_opf)) return BLKG_IOSTAT_WRITE; return BLKG_IOSTAT_READ; } void blk_cgroup_bio_start(struct bio *bio) { struct blkcg *blkcg = bio->bi_blkg->blkcg; int rwd = blk_cgroup_io_type(bio), cpu; struct blkg_iostat_set *bis; unsigned long flags; if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) return; /* Root-level stats are sourced from system-wide IO stats */ if (!cgroup_parent(blkcg->css.cgroup)) return; cpu = get_cpu(); bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu); flags = u64_stats_update_begin_irqsave(&bis->sync); /* * If the bio is flagged with BIO_CGROUP_ACCT it means this is a split * bio and we would have already accounted for the size of the bio. */ if (!bio_flagged(bio, BIO_CGROUP_ACCT)) { bio_set_flag(bio, BIO_CGROUP_ACCT); bis->cur.bytes[rwd] += bio->bi_iter.bi_size; } bis->cur.ios[rwd]++; /* * If the iostat_cpu isn't in a lockless list, put it into the * list to indicate that a stat update is pending. 
*/ if (!READ_ONCE(bis->lqueued)) { struct llist_head *lhead = this_cpu_ptr(blkcg->lhead); llist_add(&bis->lnode, lhead); WRITE_ONCE(bis->lqueued, true); } u64_stats_update_end_irqrestore(&bis->sync, flags); cgroup_rstat_updated(blkcg->css.cgroup, cpu); put_cpu(); } bool blk_cgroup_congested(void) { struct cgroup_subsys_state *css; bool ret = false; rcu_read_lock(); for (css = blkcg_css(); css; css = css->parent) { if (atomic_read(&css->cgroup->congestion_count)) { ret = true; break; } } rcu_read_unlock(); return ret; } module_param(blkcg_debug_stats, bool, 0644); MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
24 14 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef _NET_GSO_H #define _NET_GSO_H #include <linux/skbuff.h> /* Keeps track of mac header offset relative to skb->head. * It is useful for TSO of Tunneling protocol. e.g. GRE. * For non-tunnel skb it points to skb_mac_header() and for * tunnel skb it points to outer mac header. * Keeps track of level of encapsulation of network headers. */ struct skb_gso_cb { union { int mac_offset; int data_offset; }; int encap_level; __wsum csum; __u16 csum_start; }; #define SKB_GSO_CB_OFFSET 32 #define SKB_GSO_CB(skb) ((struct skb_gso_cb *)((skb)->cb + SKB_GSO_CB_OFFSET)) static inline int skb_tnl_header_len(const struct sk_buff *inner_skb) { return (skb_mac_header(inner_skb) - inner_skb->head) - SKB_GSO_CB(inner_skb)->mac_offset; } static inline int gso_pskb_expand_head(struct sk_buff *skb, int extra) { int new_headroom, headroom; int ret; headroom = skb_headroom(skb); ret = pskb_expand_head(skb, extra, 0, GFP_ATOMIC); if (ret) return ret; new_headroom = skb_headroom(skb); SKB_GSO_CB(skb)->mac_offset += (new_headroom - headroom); return 0; } static inline void gso_reset_checksum(struct sk_buff *skb, __wsum res) { /* Do not update partial checksums if remote checksum is enabled. */ if (skb->remcsum_offload) return; SKB_GSO_CB(skb)->csum = res; SKB_GSO_CB(skb)->csum_start = skb_checksum_start(skb) - skb->head; } /* Compute the checksum for a gso segment. First compute the checksum value * from the start of transport header to SKB_GSO_CB(skb)->csum_start, and * then add in skb->csum (checksum from csum_start to end of packet). 
* skb->csum and csum_start are then updated to reflect the checksum of the * resultant packet starting from the transport header-- the resultant checksum * is in the res argument (i.e. normally zero or ~ of checksum of a pseudo * header. */ static inline __sum16 gso_make_checksum(struct sk_buff *skb, __wsum res) { unsigned char *csum_start = skb_transport_header(skb); int plen = (skb->head + SKB_GSO_CB(skb)->csum_start) - csum_start; __wsum partial = SKB_GSO_CB(skb)->csum; SKB_GSO_CB(skb)->csum = res; SKB_GSO_CB(skb)->csum_start = csum_start - skb->head; return csum_fold(csum_partial(csum_start, plen, partial)); } struct sk_buff *__skb_gso_segment(struct sk_buff *skb, netdev_features_t features, bool tx_path); static inline struct sk_buff *skb_gso_segment(struct sk_buff *skb, netdev_features_t features) { return __skb_gso_segment(skb, features, true); } struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb, netdev_features_t features, __be16 type); struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, netdev_features_t features); bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu); bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len); static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol, int pulled_hlen, u16 mac_offset, int mac_len) { skb->protocol = protocol; skb->encapsulation = 1; skb_push(skb, pulled_hlen); skb_reset_transport_header(skb); skb->mac_header = mac_offset; skb->network_header = skb->mac_header + mac_len; skb->mac_len = mac_len; } #endif /* _NET_GSO_H */
2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 
527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_VXLAN_H #define __NET_VXLAN_H 1 #include <linux/if_vlan.h> #include <linux/rhashtable-types.h> #include <net/udp_tunnel.h> #include <net/dst_metadata.h> #include <net/rtnetlink.h> #include <net/switchdev.h> #include <net/nexthop.h> #define IANA_VXLAN_UDP_PORT 4789 #define IANA_VXLAN_GPE_UDP_PORT 4790 /* VXLAN protocol (RFC 7348) header: * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |R|R|R|R|I|R|R|R| Reserved | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | VXLAN Network Identifier (VNI) | Reserved | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * I = VXLAN Network Identifier (VNI) present. */ struct vxlanhdr { __be32 vx_flags; __be32 vx_vni; }; /* VXLAN header flags. */ #define VXLAN_HF_VNI cpu_to_be32(BIT(27)) #define VXLAN_N_VID (1u << 24) #define VXLAN_VID_MASK (VXLAN_N_VID - 1) #define VXLAN_VNI_MASK cpu_to_be32(VXLAN_VID_MASK << 8) #define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr)) #define VNI_HASH_BITS 10 #define VNI_HASH_SIZE (1<<VNI_HASH_BITS) #define FDB_HASH_BITS 8 #define FDB_HASH_SIZE (1<<FDB_HASH_BITS) /* Remote checksum offload for VXLAN (VXLAN_F_REMCSUM_[RT]X): * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |R|R|R|R|I|R|R|R|R|R|C| Reserved | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | VXLAN Network Identifier (VNI) |O| Csum start | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * C = Remote checksum offload bit. When set indicates that the * remote checksum offload data is present. * * O = Offset bit. 
Indicates the checksum offset relative to * checksum start. * * Csum start = Checksum start divided by two. * * http://tools.ietf.org/html/draft-herbert-vxlan-rco */ /* VXLAN-RCO header flags. */ #define VXLAN_HF_RCO cpu_to_be32(BIT(21)) /* Remote checksum offload header option */ #define VXLAN_RCO_MASK cpu_to_be32(0x7f) /* Last byte of vni field */ #define VXLAN_RCO_UDP cpu_to_be32(0x80) /* Indicate UDP RCO (TCP when not set *) */ #define VXLAN_RCO_SHIFT 1 /* Left shift of start */ #define VXLAN_RCO_SHIFT_MASK ((1 << VXLAN_RCO_SHIFT) - 1) #define VXLAN_MAX_REMCSUM_START (0x7f << VXLAN_RCO_SHIFT) /* * VXLAN Group Based Policy Extension (VXLAN_F_GBP): * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |G|R|R|R|I|R|R|R|R|D|R|R|A|R|R|R| Group Policy ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | VXLAN Network Identifier (VNI) | Reserved | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * G = Group Policy ID present. * * D = Don't Learn bit. When set, this bit indicates that the egress * VTEP MUST NOT learn the source address of the encapsulated frame. * * A = Indicates that the group policy has already been applied to * this packet. Policies MUST NOT be applied by devices when the * A bit is set. * * https://tools.ietf.org/html/draft-smith-vxlan-group-policy */ struct vxlanhdr_gbp { u8 vx_flags; #ifdef __LITTLE_ENDIAN_BITFIELD u8 reserved_flags1:3, policy_applied:1, reserved_flags2:2, dont_learn:1, reserved_flags3:1; #elif defined(__BIG_ENDIAN_BITFIELD) u8 reserved_flags1:1, dont_learn:1, reserved_flags2:2, policy_applied:1, reserved_flags3:3; #else #error "Please fix <asm/byteorder.h>" #endif __be16 policy_id; __be32 vx_vni; }; /* VXLAN-GBP header flags. 
*/ #define VXLAN_HF_GBP cpu_to_be32(BIT(31)) #define VXLAN_GBP_USED_BITS (VXLAN_HF_GBP | cpu_to_be32(0xFFFFFF)) /* skb->mark mapping * * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |R|R|R|R|R|R|R|R|R|D|R|R|A|R|R|R| Group Policy ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ #define VXLAN_GBP_DONT_LEARN (BIT(6) << 16) #define VXLAN_GBP_POLICY_APPLIED (BIT(3) << 16) #define VXLAN_GBP_ID_MASK (0xFFFF) #define VXLAN_GBP_MASK (VXLAN_GBP_DONT_LEARN | VXLAN_GBP_POLICY_APPLIED | \ VXLAN_GBP_ID_MASK) /* * VXLAN Generic Protocol Extension (VXLAN_F_GPE): * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |R|R|Ver|I|P|R|O| Reserved |Next Protocol | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | VXLAN Network Identifier (VNI) | Reserved | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Ver = Version. Indicates VXLAN GPE protocol version. * * P = Next Protocol Bit. The P bit is set to indicate that the * Next Protocol field is present. * * O = OAM Flag Bit. The O bit is set to indicate that the packet * is an OAM packet. * * Next Protocol = This 8 bit field indicates the protocol header * immediately following the VXLAN GPE header. * * https://tools.ietf.org/html/draft-ietf-nvo3-vxlan-gpe-01 */ struct vxlanhdr_gpe { #if defined(__LITTLE_ENDIAN_BITFIELD) u8 oam_flag:1, reserved_flags1:1, np_applied:1, instance_applied:1, version:2, reserved_flags2:2; #elif defined(__BIG_ENDIAN_BITFIELD) u8 reserved_flags2:2, version:2, instance_applied:1, np_applied:1, reserved_flags1:1, oam_flag:1; #endif u8 reserved_flags3; u8 reserved_flags4; u8 next_protocol; __be32 vx_vni; }; /* VXLAN-GPE header flags. 
*/ #define VXLAN_HF_VER cpu_to_be32(BIT(29) | BIT(28)) #define VXLAN_HF_NP cpu_to_be32(BIT(26)) #define VXLAN_HF_OAM cpu_to_be32(BIT(24)) #define VXLAN_GPE_USED_BITS (VXLAN_HF_VER | VXLAN_HF_NP | VXLAN_HF_OAM | \ cpu_to_be32(0xff)) struct vxlan_metadata { u32 gbp; }; /* per UDP socket information */ struct vxlan_sock { struct hlist_node hlist; struct socket *sock; struct hlist_head vni_list[VNI_HASH_SIZE]; refcount_t refcnt; u32 flags; }; union vxlan_addr { struct sockaddr_in sin; struct sockaddr_in6 sin6; struct sockaddr sa; }; struct vxlan_rdst { union vxlan_addr remote_ip; __be16 remote_port; u8 offloaded:1; __be32 remote_vni; u32 remote_ifindex; struct net_device *remote_dev; struct list_head list; struct rcu_head rcu; struct dst_cache dst_cache; }; struct vxlan_config { union vxlan_addr remote_ip; union vxlan_addr saddr; __be32 vni; int remote_ifindex; int mtu; __be16 dst_port; u16 port_min; u16 port_max; u8 tos; u8 ttl; __be32 label; enum ifla_vxlan_label_policy label_policy; u32 flags; unsigned long age_interval; unsigned int addrmax; bool no_share; enum ifla_vxlan_df df; }; enum { VXLAN_VNI_STATS_RX, VXLAN_VNI_STATS_RX_DROPS, VXLAN_VNI_STATS_RX_ERRORS, VXLAN_VNI_STATS_TX, VXLAN_VNI_STATS_TX_DROPS, VXLAN_VNI_STATS_TX_ERRORS, }; struct vxlan_vni_stats { u64 rx_packets; u64 rx_bytes; u64 rx_drops; u64 rx_errors; u64 tx_packets; u64 tx_bytes; u64 tx_drops; u64 tx_errors; }; struct vxlan_vni_stats_pcpu { struct vxlan_vni_stats stats; struct u64_stats_sync syncp; }; struct vxlan_dev_node { struct hlist_node hlist; struct vxlan_dev *vxlan; }; struct vxlan_vni_node { struct rhash_head vnode; struct vxlan_dev_node hlist4; /* vni hash table for IPv4 socket */ #if IS_ENABLED(CONFIG_IPV6) struct vxlan_dev_node hlist6; /* vni hash table for IPv6 socket */ #endif struct list_head vlist; __be32 vni; union vxlan_addr remote_ip; /* default remote ip for this vni */ struct vxlan_vni_stats_pcpu __percpu *stats; struct rcu_head rcu; }; struct vxlan_vni_group { struct 
rhashtable vni_hash; struct list_head vni_list; u32 num_vnis; }; /* Pseudo network device */ struct vxlan_dev { struct vxlan_dev_node hlist4; /* vni hash table for IPv4 socket */ #if IS_ENABLED(CONFIG_IPV6) struct vxlan_dev_node hlist6; /* vni hash table for IPv6 socket */ #endif struct list_head next; /* vxlan's per namespace list */ struct vxlan_sock __rcu *vn4_sock; /* listening socket for IPv4 */ #if IS_ENABLED(CONFIG_IPV6) struct vxlan_sock __rcu *vn6_sock; /* listening socket for IPv6 */ #endif struct net_device *dev; struct net *net; /* netns for packet i/o */ struct vxlan_rdst default_dst; /* default destination */ struct timer_list age_timer; spinlock_t hash_lock[FDB_HASH_SIZE]; unsigned int addrcnt; struct gro_cells gro_cells; struct vxlan_config cfg; struct vxlan_vni_group __rcu *vnigrp; struct hlist_head fdb_head[FDB_HASH_SIZE]; struct rhashtable mdb_tbl; struct hlist_head mdb_list; unsigned int mdb_seq; }; #define VXLAN_F_LEARN 0x01 #define VXLAN_F_PROXY 0x02 #define VXLAN_F_RSC 0x04 #define VXLAN_F_L2MISS 0x08 #define VXLAN_F_L3MISS 0x10 #define VXLAN_F_IPV6 0x20 #define VXLAN_F_UDP_ZERO_CSUM_TX 0x40 #define VXLAN_F_UDP_ZERO_CSUM6_TX 0x80 #define VXLAN_F_UDP_ZERO_CSUM6_RX 0x100 #define VXLAN_F_REMCSUM_TX 0x200 #define VXLAN_F_REMCSUM_RX 0x400 #define VXLAN_F_GBP 0x800 #define VXLAN_F_REMCSUM_NOPARTIAL 0x1000 #define VXLAN_F_COLLECT_METADATA 0x2000 #define VXLAN_F_GPE 0x4000 #define VXLAN_F_IPV6_LINKLOCAL 0x8000 #define VXLAN_F_TTL_INHERIT 0x10000 #define VXLAN_F_VNIFILTER 0x20000 #define VXLAN_F_MDB 0x40000 #define VXLAN_F_LOCALBYPASS 0x80000 /* Flags that are used in the receive path. These flags must match in * order for a socket to be shareable */ #define VXLAN_F_RCV_FLAGS (VXLAN_F_GBP | \ VXLAN_F_GPE | \ VXLAN_F_UDP_ZERO_CSUM6_RX | \ VXLAN_F_REMCSUM_RX | \ VXLAN_F_REMCSUM_NOPARTIAL | \ VXLAN_F_COLLECT_METADATA | \ VXLAN_F_VNIFILTER) /* Flags that can be set together with VXLAN_F_GPE. 
*/ #define VXLAN_F_ALLOWED_GPE (VXLAN_F_GPE | \ VXLAN_F_IPV6 | \ VXLAN_F_IPV6_LINKLOCAL | \ VXLAN_F_UDP_ZERO_CSUM_TX | \ VXLAN_F_UDP_ZERO_CSUM6_TX | \ VXLAN_F_UDP_ZERO_CSUM6_RX | \ VXLAN_F_COLLECT_METADATA | \ VXLAN_F_VNIFILTER | \ VXLAN_F_LOCALBYPASS) struct net_device *vxlan_dev_create(struct net *net, const char *name, u8 name_assign_type, struct vxlan_config *conf); static inline netdev_features_t vxlan_features_check(struct sk_buff *skb, netdev_features_t features) { u8 l4_hdr = 0; if (!skb->encapsulation) return features; switch (vlan_get_protocol(skb)) { case htons(ETH_P_IP): l4_hdr = ip_hdr(skb)->protocol; break; case htons(ETH_P_IPV6): l4_hdr = ipv6_hdr(skb)->nexthdr; break; default: return features; } if ((l4_hdr == IPPROTO_UDP) && (skb->inner_protocol_type != ENCAP_TYPE_ETHER || skb->inner_protocol != htons(ETH_P_TEB) || (skb_inner_mac_header(skb) - skb_transport_header(skb) != sizeof(struct udphdr) + sizeof(struct vxlanhdr)) || (skb->ip_summed != CHECKSUM_NONE && !can_checksum_protocol(features, inner_eth_hdr(skb)->h_proto)))) return features & ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); return features; } static inline int vxlan_headroom(u32 flags) { /* VXLAN: IP4/6 header + UDP + VXLAN + Ethernet header */ /* VXLAN-GPE: IP4/6 header + UDP + VXLAN */ return (flags & VXLAN_F_IPV6 ? sizeof(struct ipv6hdr) : sizeof(struct iphdr)) + sizeof(struct udphdr) + sizeof(struct vxlanhdr) + (flags & VXLAN_F_GPE ? 
0 : ETH_HLEN); } static inline struct vxlanhdr *vxlan_hdr(struct sk_buff *skb) { return (struct vxlanhdr *)(udp_hdr(skb) + 1); } static inline __be32 vxlan_vni(__be32 vni_field) { #if defined(__BIG_ENDIAN) return (__force __be32)((__force u32)vni_field >> 8); #else return (__force __be32)((__force u32)(vni_field & VXLAN_VNI_MASK) << 8); #endif } static inline __be32 vxlan_vni_field(__be32 vni) { #if defined(__BIG_ENDIAN) return (__force __be32)((__force u32)vni << 8); #else return (__force __be32)((__force u32)vni >> 8); #endif } static inline size_t vxlan_rco_start(__be32 vni_field) { return be32_to_cpu(vni_field & VXLAN_RCO_MASK) << VXLAN_RCO_SHIFT; } static inline size_t vxlan_rco_offset(__be32 vni_field) { return (vni_field & VXLAN_RCO_UDP) ? offsetof(struct udphdr, check) : offsetof(struct tcphdr, check); } static inline __be32 vxlan_compute_rco(unsigned int start, unsigned int offset) { __be32 vni_field = cpu_to_be32(start >> VXLAN_RCO_SHIFT); if (offset == offsetof(struct udphdr, check)) vni_field |= VXLAN_RCO_UDP; return vni_field; } static inline unsigned short vxlan_get_sk_family(struct vxlan_sock *vs) { return vs->sock->sk->sk_family; } #if IS_ENABLED(CONFIG_IPV6) static inline bool vxlan_addr_any(const union vxlan_addr *ipa) { if (ipa->sa.sa_family == AF_INET6) return ipv6_addr_any(&ipa->sin6.sin6_addr); else return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY); } static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa) { if (ipa->sa.sa_family == AF_INET6) return ipv6_addr_is_multicast(&ipa->sin6.sin6_addr); else return ipv4_is_multicast(ipa->sin.sin_addr.s_addr); } #else /* !IS_ENABLED(CONFIG_IPV6) */ static inline bool vxlan_addr_any(const union vxlan_addr *ipa) { return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY); } static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa) { return ipv4_is_multicast(ipa->sin.sin_addr.s_addr); } #endif /* IS_ENABLED(CONFIG_IPV6) */ static inline bool netif_is_vxlan(const struct net_device 
*dev) { return dev->rtnl_link_ops && !strcmp(dev->rtnl_link_ops->kind, "vxlan"); } struct switchdev_notifier_vxlan_fdb_info { struct switchdev_notifier_info info; /* must be first */ union vxlan_addr remote_ip; __be16 remote_port; __be32 remote_vni; u32 remote_ifindex; u8 eth_addr[ETH_ALEN]; __be32 vni; bool offloaded; bool added_by_user; }; #if IS_ENABLED(CONFIG_VXLAN) int vxlan_fdb_find_uc(struct net_device *dev, const u8 *mac, __be32 vni, struct switchdev_notifier_vxlan_fdb_info *fdb_info); int vxlan_fdb_replay(const struct net_device *dev, __be32 vni, struct notifier_block *nb, struct netlink_ext_ack *extack); void vxlan_fdb_clear_offload(const struct net_device *dev, __be32 vni); #else static inline int vxlan_fdb_find_uc(struct net_device *dev, const u8 *mac, __be32 vni, struct switchdev_notifier_vxlan_fdb_info *fdb_info) { return -ENOENT; } static inline int vxlan_fdb_replay(const struct net_device *dev, __be32 vni, struct notifier_block *nb, struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } static inline void vxlan_fdb_clear_offload(const struct net_device *dev, __be32 vni) { } #endif static inline void vxlan_flag_attr_error(int attrtype, struct netlink_ext_ack *extack) { #define VXLAN_FLAG(flg) \ case IFLA_VXLAN_##flg: \ NL_SET_ERR_MSG_MOD(extack, \ "cannot change " #flg " flag"); \ break switch (attrtype) { VXLAN_FLAG(TTL_INHERIT); VXLAN_FLAG(LEARNING); VXLAN_FLAG(PROXY); VXLAN_FLAG(RSC); VXLAN_FLAG(L2MISS); VXLAN_FLAG(L3MISS); VXLAN_FLAG(COLLECT_METADATA); VXLAN_FLAG(UDP_ZERO_CSUM6_TX); VXLAN_FLAG(UDP_ZERO_CSUM6_RX); VXLAN_FLAG(REMCSUM_TX); VXLAN_FLAG(REMCSUM_RX); VXLAN_FLAG(GBP); VXLAN_FLAG(GPE); VXLAN_FLAG(REMCSUM_NOPARTIAL); default: NL_SET_ERR_MSG_MOD(extack, \ "cannot change flag"); break; } #undef VXLAN_FLAG } static inline bool vxlan_fdb_nh_path_select(struct nexthop *nh, u32 hash, struct vxlan_rdst *rdst) { struct fib_nh_common *nhc; nhc = nexthop_path_fdb_result(nh, hash >> 1); if (unlikely(!nhc)) return false; switch (nhc->nhc_gw_family) { 
case AF_INET: rdst->remote_ip.sin.sin_addr.s_addr = nhc->nhc_gw.ipv4; rdst->remote_ip.sa.sa_family = AF_INET; break; case AF_INET6: rdst->remote_ip.sin6.sin6_addr = nhc->nhc_gw.ipv6; rdst->remote_ip.sa.sa_family = AF_INET6; break; } return true; } static inline void vxlan_build_gbp_hdr(struct vxlanhdr *vxh, const struct vxlan_metadata *md) { struct vxlanhdr_gbp *gbp; if (!md->gbp) return; gbp = (struct vxlanhdr_gbp *)vxh; vxh->vx_flags |= VXLAN_HF_GBP; if (md->gbp & VXLAN_GBP_DONT_LEARN) gbp->dont_learn = 1; if (md->gbp & VXLAN_GBP_POLICY_APPLIED) gbp->policy_applied = 1; gbp->policy_id = htons(md->gbp & VXLAN_GBP_ID_MASK); } #endif
102 102 114 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM page_pool #if !defined(_TRACE_PAGE_POOL_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_PAGE_POOL_H #include <linux/types.h> #include <linux/tracepoint.h> #include <trace/events/mmflags.h> #include <net/page_pool/types.h> TRACE_EVENT(page_pool_release, TP_PROTO(const struct page_pool *pool, s32 inflight, u32 hold, u32 release), TP_ARGS(pool, inflight, hold, release), TP_STRUCT__entry( __field(const struct page_pool *, pool) __field(s32, inflight) __field(u32, hold) __field(u32, release) __field(u64, cnt) ), TP_fast_assign( __entry->pool = pool; __entry->inflight = inflight; __entry->hold = hold; __entry->release = release; __entry->cnt = pool->destroy_cnt; ), TP_printk("page_pool=%p inflight=%d hold=%u release=%u cnt=%llu", __entry->pool, __entry->inflight, __entry->hold, __entry->release, __entry->cnt) ); TRACE_EVENT(page_pool_state_release, TP_PROTO(const struct page_pool *pool, const struct page *page, u32 release), TP_ARGS(pool, page, release), TP_STRUCT__entry( __field(const struct page_pool *, pool) __field(const struct page *, page) __field(u32, release) __field(unsigned long, pfn) ), TP_fast_assign( __entry->pool = pool; __entry->page = page; __entry->release = release; __entry->pfn = page_to_pfn(page); ), TP_printk("page_pool=%p page=%p pfn=0x%lx release=%u", __entry->pool, __entry->page, __entry->pfn, __entry->release) ); TRACE_EVENT(page_pool_state_hold, TP_PROTO(const struct page_pool *pool, const struct page *page, u32 hold), TP_ARGS(pool, page, hold), TP_STRUCT__entry( __field(const struct page_pool *, 
pool) __field(const struct page *, page) __field(u32, hold) __field(unsigned long, pfn) ), TP_fast_assign( __entry->pool = pool; __entry->page = page; __entry->hold = hold; __entry->pfn = page_to_pfn(page); ), TP_printk("page_pool=%p page=%p pfn=0x%lx hold=%u", __entry->pool, __entry->page, __entry->pfn, __entry->hold) ); TRACE_EVENT(page_pool_update_nid, TP_PROTO(const struct page_pool *pool, int new_nid), TP_ARGS(pool, new_nid), TP_STRUCT__entry( __field(const struct page_pool *, pool) __field(int, pool_nid) __field(int, new_nid) ), TP_fast_assign( __entry->pool = pool; __entry->pool_nid = pool->p.nid; __entry->new_nid = new_nid; ), TP_printk("page_pool=%p pool_nid=%d new_nid=%d", __entry->pool, __entry->pool_nid, __entry->new_nid) ); #endif /* _TRACE_PAGE_POOL_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
14 3 2 11 7 1 22 2 3 17 17 21 1 3 3 2 13 17 83 89 59 61 20 42 44 44 55 6 6 6 60 60 61 59 82 83 57 35 84 84 82 74 10 79 5 81 2 74 10 86 34 33 29 3 27 5 54 3 72 1 71 4 68 8 67 8 7 68 1 61 6 55 7 7 2 8 1 7 1 5 2 3 3 3 5 3 1 4 1 3 1 1 9 19 11 9 42 21 21 3 3 1 2 25 24 1 1 10 13 13 1 2 10 8 3 3 28 13 35 92 37 52 1 83 87 23 63 89 163 166 125 50 33 32 21 7 7 1 7 7 64 43 28 2 63 63 1 6 56 54 43 18 44 2 49 48 40 13 9 38 1 26 12 8 5 19 30 28 19 3 1 17 2 1 11 4 4 4 4 4 34 1 20 13 4 4 39 95 42 8 5 5 8 5 7 12 29 9 9 20 1 6 13 15 7 4 14 14 14 15 9 7 17 16 16 3 3 3 3 3 15 14 15 15 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 
384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 
884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 
1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 
1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */ #include <linux/bpf.h> #include <linux/btf_ids.h> #include <linux/filter.h> #include <linux/errno.h> #include <linux/file.h> #include <linux/net.h> #include <linux/workqueue.h> #include <linux/skmsg.h> #include <linux/list.h> #include <linux/jhash.h> #include <linux/sock_diag.h> #include <net/udp.h> struct bpf_stab { struct bpf_map map; struct sock **sks; struct sk_psock_progs progs; spinlock_t lock; }; #define SOCK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, struct bpf_prog *old, u32 which); static struct sk_psock_progs *sock_map_progs(struct bpf_map *map); static struct bpf_map *sock_map_alloc(union bpf_attr *attr) { struct bpf_stab *stab; if (attr->max_entries == 0 || attr->key_size != 4 || (attr->value_size != sizeof(u32) && attr->value_size != sizeof(u64)) || attr->map_flags & ~SOCK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); stab = bpf_map_area_alloc(sizeof(*stab), NUMA_NO_NODE); if (!stab) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&stab->map, attr); spin_lock_init(&stab->lock); stab->sks = bpf_map_area_alloc((u64) stab->map.max_entries * sizeof(struct sock *), stab->map.numa_node); if (!stab->sks) { bpf_map_area_free(stab); return ERR_PTR(-ENOMEM); } return &stab->map; } int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) { u32 ufd = attr->target_fd; struct bpf_map *map; struct fd f; int ret; if (attr->attach_flags || attr->replace_bpf_fd) return -EINVAL; f = fdget(ufd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); ret = sock_map_prog_update(map, prog, NULL, attr->attach_type); fdput(f); return ret; } int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) { u32 ufd = attr->target_fd; struct bpf_prog *prog; struct bpf_map *map; struct 
fd f; int ret; if (attr->attach_flags || attr->replace_bpf_fd) return -EINVAL; f = fdget(ufd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); prog = bpf_prog_get(attr->attach_bpf_fd); if (IS_ERR(prog)) { ret = PTR_ERR(prog); goto put_map; } if (prog->type != ptype) { ret = -EINVAL; goto put_prog; } ret = sock_map_prog_update(map, NULL, prog, attr->attach_type); put_prog: bpf_prog_put(prog); put_map: fdput(f); return ret; } static void sock_map_sk_acquire(struct sock *sk) __acquires(&sk->sk_lock.slock) { lock_sock(sk); rcu_read_lock(); } static void sock_map_sk_release(struct sock *sk) __releases(&sk->sk_lock.slock) { rcu_read_unlock(); release_sock(sk); } static void sock_map_add_link(struct sk_psock *psock, struct sk_psock_link *link, struct bpf_map *map, void *link_raw) { link->link_raw = link_raw; link->map = map; spin_lock_bh(&psock->link_lock); list_add_tail(&link->list, &psock->link); spin_unlock_bh(&psock->link_lock); } static void sock_map_del_link(struct sock *sk, struct sk_psock *psock, void *link_raw) { bool strp_stop = false, verdict_stop = false; struct sk_psock_link *link, *tmp; spin_lock_bh(&psock->link_lock); list_for_each_entry_safe(link, tmp, &psock->link, list) { if (link->link_raw == link_raw) { struct bpf_map *map = link->map; struct sk_psock_progs *progs = sock_map_progs(map); if (psock->saved_data_ready && progs->stream_parser) strp_stop = true; if (psock->saved_data_ready && progs->stream_verdict) verdict_stop = true; if (psock->saved_data_ready && progs->skb_verdict) verdict_stop = true; list_del(&link->list); sk_psock_free_link(link); } } spin_unlock_bh(&psock->link_lock); if (strp_stop || verdict_stop) { write_lock_bh(&sk->sk_callback_lock); if (strp_stop) sk_psock_stop_strp(sk, psock); if (verdict_stop) sk_psock_stop_verdict(sk, psock); if (psock->psock_update_sk_prot) psock->psock_update_sk_prot(sk, psock, false); write_unlock_bh(&sk->sk_callback_lock); } } static void sock_map_unref(struct sock *sk, void *link_raw) { 
struct sk_psock *psock = sk_psock(sk);

	/* A socket without a psock has nothing to unlink. */
	if (likely(psock)) {
		sock_map_del_link(sk, psock, link_raw);
		sk_psock_put(sk, psock);
	}
}

/* Remember the protocol's psock_update_sk_prot hook in @psock and invoke
 * it with restore == false.
 *
 * Returns -EINVAL if the socket's protocol provides no such hook, otherwise
 * the hook's return value.
 */
static int sock_map_init_proto(struct sock *sk, struct sk_psock *psock)
{
	if (!sk->sk_prot->psock_update_sk_prot)
		return -EINVAL;
	psock->psock_update_sk_prot = sk->sk_prot->psock_update_sk_prot;
	return sk->sk_prot->psock_update_sk_prot(sk, psock, false);
}

/* Look up @sk's psock under RCU and take a reference on it.
 *
 * Returns NULL when the socket has no psock, ERR_PTR(-EBUSY) when the
 * socket's close handler is not sock_map_close or the psock refcount has
 * already dropped to zero, otherwise the referenced psock.
 */
static struct sk_psock *sock_map_psock_get_checked(struct sock *sk)
{
	struct sk_psock *psock;

	rcu_read_lock();
	psock = sk_psock(sk);
	if (psock) {
		if (sk->sk_prot->close != sock_map_close) {
			psock = ERR_PTR(-EBUSY);
			goto out;
		}

		if (!refcount_inc_not_zero(&psock->refcnt))
			psock = ERR_PTR(-EBUSY);
	}
out:
	rcu_read_unlock();
	return psock;
}

/* Attach @sk to @map: take references on the map's programs, obtain or
 * create the socket's psock, install the programs in it and start the
 * parser/verdict machinery as needed.
 *
 * Returns 0 on success or a negative errno; -EBUSY when the psock already
 * has a conflicting program installed.
 */
static int sock_map_link(struct bpf_map *map, struct sock *sk)
{
	struct sk_psock_progs *progs = sock_map_progs(map);
	struct bpf_prog *stream_verdict = NULL;
	struct bpf_prog *stream_parser = NULL;
	struct bpf_prog *skb_verdict = NULL;
	struct bpf_prog *msg_parser = NULL;
	struct sk_psock *psock;
	int ret;

	/* Each program is read once and pinned with inc_not_zero; a failure
	 * unwinds only the references already taken (labels at the bottom).
	 */
	stream_verdict = READ_ONCE(progs->stream_verdict);
	if (stream_verdict) {
		stream_verdict = bpf_prog_inc_not_zero(stream_verdict);
		if (IS_ERR(stream_verdict))
			return PTR_ERR(stream_verdict);
	}

	stream_parser = READ_ONCE(progs->stream_parser);
	if (stream_parser) {
		stream_parser = bpf_prog_inc_not_zero(stream_parser);
		if (IS_ERR(stream_parser)) {
			ret = PTR_ERR(stream_parser);
			goto out_put_stream_verdict;
		}
	}

	msg_parser = READ_ONCE(progs->msg_parser);
	if (msg_parser) {
		msg_parser = bpf_prog_inc_not_zero(msg_parser);
		if (IS_ERR(msg_parser)) {
			ret = PTR_ERR(msg_parser);
			goto out_put_stream_parser;
		}
	}

	skb_verdict = READ_ONCE(progs->skb_verdict);
	if (skb_verdict) {
		skb_verdict = bpf_prog_inc_not_zero(skb_verdict);
		if (IS_ERR(skb_verdict)) {
			ret = PTR_ERR(skb_verdict);
			goto out_put_msg_parser;
		}
	}

	psock = sock_map_psock_get_checked(sk);
	if (IS_ERR(psock)) {
		ret = PTR_ERR(psock);
		goto out_progs;
	}

	if (psock) {
		if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) ||
(stream_parser  && READ_ONCE(psock->progs.stream_parser)) ||
		    (skb_verdict && READ_ONCE(psock->progs.skb_verdict)) ||
		    (skb_verdict && READ_ONCE(psock->progs.stream_verdict)) ||
		    (stream_verdict && READ_ONCE(psock->progs.skb_verdict)) ||
		    (stream_verdict && READ_ONCE(psock->progs.stream_verdict))) {
			/* Existing psock already has a program of this kind
			 * (the two verdict kinds exclude each other).
			 */
			sk_psock_put(sk, psock);
			ret = -EBUSY;
			goto out_progs;
		}
	} else {
		psock = sk_psock_init(sk, map->numa_node);
		if (IS_ERR(psock)) {
			ret = PTR_ERR(psock);
			goto out_progs;
		}
	}

	if (msg_parser)
		psock_set_prog(&psock->progs.msg_parser, msg_parser);
	if (stream_parser)
		psock_set_prog(&psock->progs.stream_parser, stream_parser);
	if (stream_verdict)
		psock_set_prog(&psock->progs.stream_verdict, stream_verdict);
	if (skb_verdict)
		psock_set_prog(&psock->progs.skb_verdict, skb_verdict);

	/* msg_* and stream_* programs references tracked in psock after this
	 * point. Reference dec and cleanup will occur through psock destructor
	 */
	ret = sock_map_init_proto(sk, psock);
	if (ret < 0) {
		sk_psock_put(sk, psock);
		goto out;
	}

	/* Parser/verdict are only started when no data_ready callback has
	 * been saved in the psock yet.
	 */
	write_lock_bh(&sk->sk_callback_lock);
	if (stream_parser && stream_verdict && !psock->saved_data_ready) {
		ret = sk_psock_init_strp(sk, psock);
		if (ret) {
			write_unlock_bh(&sk->sk_callback_lock);
			sk_psock_put(sk, psock);
			goto out;
		}
		sk_psock_start_strp(sk, psock);
	} else if (!stream_parser && stream_verdict && !psock->saved_data_ready) {
		sk_psock_start_verdict(sk,psock);
	} else if (!stream_verdict && skb_verdict && !psock->saved_data_ready) {
		sk_psock_start_verdict(sk, psock);
	}
	write_unlock_bh(&sk->sk_callback_lock);
	return 0;
	/* Unwind in reverse order of acquisition. */
out_progs:
	if (skb_verdict)
		bpf_prog_put(skb_verdict);
out_put_msg_parser:
	if (msg_parser)
		bpf_prog_put(msg_parser);
out_put_stream_parser:
	if (stream_parser)
		bpf_prog_put(stream_parser);
out_put_stream_verdict:
	if (stream_verdict)
		bpf_prog_put(stream_verdict);
out:
	return ret;
}

/* Release a sockmap: unlink every socket still stored, then free the slot
 * array and the map itself.
 */
static void sock_map_free(struct bpf_map *map)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
	int i;

	/* After the sync no updates or deletes will be in-flight so
it * is safe to walk map and remove entries without risking a race * in EEXIST update case. */ synchronize_rcu(); for (i = 0; i < stab->map.max_entries; i++) { struct sock **psk = &stab->sks[i]; struct sock *sk; sk = xchg(psk, NULL); if (sk) { sock_hold(sk); lock_sock(sk); rcu_read_lock(); sock_map_unref(sk, psk); rcu_read_unlock(); release_sock(sk); sock_put(sk); } } /* wait for psock readers accessing its map link */ synchronize_rcu(); bpf_map_area_free(stab->sks); bpf_map_area_free(stab); } static void sock_map_release_progs(struct bpf_map *map) { psock_progs_drop(&container_of(map, struct bpf_stab, map)->progs); } static struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); WARN_ON_ONCE(!rcu_read_lock_held()); if (unlikely(key >= map->max_entries)) return NULL; return READ_ONCE(stab->sks[key]); } static void *sock_map_lookup(struct bpf_map *map, void *key) { struct sock *sk; sk = __sock_map_lookup_elem(map, *(u32 *)key); if (!sk) return NULL; if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt)) return NULL; return sk; } static void *sock_map_lookup_sys(struct bpf_map *map, void *key) { struct sock *sk; if (map->value_size != sizeof(u64)) return ERR_PTR(-ENOSPC); sk = __sock_map_lookup_elem(map, *(u32 *)key); if (!sk) return ERR_PTR(-ENOENT); __sock_gen_cookie(sk); return &sk->sk_cookie; } static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test, struct sock **psk) { struct sock *sk; int err = 0; spin_lock_bh(&stab->lock); sk = *psk; if (!sk_test || sk_test == sk) sk = xchg(psk, NULL); if (likely(sk)) sock_map_unref(sk, psk); else err = -EINVAL; spin_unlock_bh(&stab->lock); return err; } static void sock_map_delete_from_link(struct bpf_map *map, struct sock *sk, void *link_raw) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); __sock_map_delete(stab, sk, link_raw); } static long sock_map_delete_elem(struct bpf_map *map, void *key) { 
struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
	u32 i = *(u32 *)key;
	struct sock **psk;

	if (unlikely(i >= map->max_entries))
		return -EINVAL;

	psk = &stab->sks[i];
	return __sock_map_delete(stab, NULL, psk);
}

/* map_get_next_key callback. A NULL or out-of-range @key restarts at
 * index 0; the last index yields -ENOENT. Empty slots are not skipped.
 */
static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
	u32 i = key ? *(u32 *)key : U32_MAX;
	u32 *key_next = next;

	if (i == stab->map.max_entries - 1)
		return -ENOENT;
	if (i >= stab->map.max_entries)
		*key_next = 0;
	else
		*key_next = i + 1;
	return 0;
}

/* Store @sk at index @idx of the sockmap, honouring BPF_NOEXIST and
 * BPF_EXIST in @flags, and unlink any socket previously stored there.
 *
 * Returns 0 on success; -EINVAL for unknown flags, -E2BIG for an
 * out-of-range index, -ENOMEM if no link could be allocated, -EEXIST or
 * -ENOENT when the flag condition is not met, or the error from
 * sock_map_link().
 */
static int sock_map_update_common(struct bpf_map *map, u32 idx,
				  struct sock *sk, u64 flags)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
	struct sk_psock_link *link;
	struct sk_psock *psock;
	struct sock *osk;
	int ret;

	WARN_ON_ONCE(!rcu_read_lock_held());
	if (unlikely(flags > BPF_EXIST))
		return -EINVAL;
	if (unlikely(idx >= map->max_entries))
		return -E2BIG;

	/* Allocate the link before taking stab->lock. */
	link = sk_psock_init_link();
	if (!link)
		return -ENOMEM;

	ret = sock_map_link(map, sk);
	if (ret < 0)
		goto out_free;

	psock = sk_psock(sk);
	WARN_ON_ONCE(!psock);

	spin_lock_bh(&stab->lock);
	osk = stab->sks[idx];
	if (osk && flags == BPF_NOEXIST) {
		ret = -EEXIST;
		goto out_unlock;
	} else if (!osk && flags == BPF_EXIST) {
		ret = -ENOENT;
		goto out_unlock;
	}

	/* Link the new socket first, then publish it, then drop the old. */
	sock_map_add_link(psock, link, map, &stab->sks[idx]);
	stab->sks[idx] = sk;
	if (osk)
		sock_map_unref(osk, &stab->sks[idx]);
	spin_unlock_bh(&stab->lock);
	return 0;
out_unlock:
	spin_unlock_bh(&stab->lock);
	/* Drop the psock reference obtained via sock_map_link(). */
	if (psock)
		sk_psock_put(sk, psock);
out_free:
	sk_psock_free_link(link);
	return ret;
}

/* True for the sock_ops callbacks from which a map update is accepted:
 * passive/active established and TCP listen.
 */
static bool sock_map_op_okay(const struct bpf_sock_ops_kern *ops)
{
	return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB ||
	       ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB ||
	       ops->op == BPF_SOCK_OPS_TCP_LISTEN_CB;
}

/* A TCP socket may be a redirect target unless it is listening; any other
 * socket must be in TCP_ESTABLISHED state.
 */
static bool sock_map_redirect_allowed(const struct sock *sk)
{
	if (sk_is_tcp(sk))
		return sk->sk_state != TCP_LISTEN;
	else
		return sk->sk_state == TCP_ESTABLISHED;
}

/* True when the socket's protocol provides a psock_update_sk_prot hook. */
static bool sock_map_sk_is_suitable(const
struct sock *sk)
{
	return !!sk->sk_prot->psock_update_sk_prot;
}

/* State check for insertion: TCP sockets must be established or listening,
 * stream unix sockets must be established, everything else is accepted.
 */
static bool sock_map_sk_state_allowed(const struct sock *sk)
{
	if (sk_is_tcp(sk))
		return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_LISTEN);
	if (sk_is_stream_unix(sk))
		return (1 << sk->sk_state) & TCPF_ESTABLISHED;
	return true;
}

static int sock_hash_update_common(struct bpf_map *map, void *key,
				   struct sock *sk, u64 flags);

/* Update a sockmap or sockhash element where @value holds a socket file
 * descriptor (u64 or u32, depending on the map's value_size).
 *
 * Returns -EINVAL for an fd above S32_MAX or a socket file without a sock,
 * -EOPNOTSUPP for an unsuitable socket or socket state, the
 * sockfd_lookup() error, or the result of the map-type specific update.
 */
int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value,
			     u64 flags)
{
	struct socket *sock;
	struct sock *sk;
	int ret;
	u64 ufd;

	if (map->value_size == sizeof(u64))
		ufd = *(u64 *)value;
	else
		ufd = *(u32 *)value;
	if (ufd > S32_MAX)
		return -EINVAL;

	sock = sockfd_lookup(ufd, &ret);
	if (!sock)
		return ret;
	sk = sock->sk;
	if (!sk) {
		ret = -EINVAL;
		goto out;
	}
	if (!sock_map_sk_is_suitable(sk)) {
		ret = -EOPNOTSUPP;
		goto out;
	}

	/* The state check and the update run under the socket lock and RCU. */
	sock_map_sk_acquire(sk);
	if (!sock_map_sk_state_allowed(sk))
		ret = -EOPNOTSUPP;
	else if (map->map_type == BPF_MAP_TYPE_SOCKMAP)
		ret = sock_map_update_common(map, *(u32 *)key, sk, flags);
	else
		ret = sock_hash_update_common(map, key, sk, flags);
	sock_map_sk_release(sk);
out:
	sockfd_put(sock);
	return ret;
}

/* map_update_elem callback shared by sockmap and sockhash; here @value is
 * a socket pointer rather than a file descriptor.
 *
 * The update runs with bottom halves disabled and bh_lock_sock() held.
 * Returns -EINVAL for a NULL or non-full socket, -EOPNOTSUPP for an
 * unsuitable socket or state, else the result of the type-specific update.
 */
static long sock_map_update_elem(struct bpf_map *map, void *key,
				 void *value, u64 flags)
{
	struct sock *sk = (struct sock *)value;
	int ret;

	if (unlikely(!sk || !sk_fullsock(sk)))
		return -EINVAL;

	if (!sock_map_sk_is_suitable(sk))
		return -EOPNOTSUPP;

	local_bh_disable();
	bh_lock_sock(sk);
	if (!sock_map_sk_state_allowed(sk))
		ret = -EOPNOTSUPP;
	else if (map->map_type == BPF_MAP_TYPE_SOCKMAP)
		ret = sock_map_update_common(map, *(u32 *)key, sk, flags);
	else
		ret = sock_hash_update_common(map, key, sk, flags);
	bh_unlock_sock(sk);
	local_bh_enable();
	return ret;
}

/* BPF helper: insert the sock_ops socket into a sockmap at index *@key.
 * Returns -EOPNOTSUPP unless the socket is suitable and the current op is
 * one accepted by sock_map_op_okay().
 */
BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, sops,
	   struct bpf_map *, map, void *, key, u64, flags)
{
	WARN_ON_ONCE(!rcu_read_lock_held());

	if (likely(sock_map_sk_is_suitable(sops->sk) &&
		   sock_map_op_okay(sops)))
		return sock_map_update_common(map, *(u32 *)key, sops->sk,
					      flags);
	return
-EOPNOTSUPP;
}

const struct bpf_func_proto bpf_sock_map_update_proto = {
	.func		= bpf_sock_map_update,
	.gpl_only	= false,
	.pkt_access	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_PTR_TO_MAP_KEY,
	.arg4_type	= ARG_ANYTHING,
};

/* BPF helper: mark @skb for redirection to the socket stored at index @key
 * of a sockmap. Returns SK_DROP for flags other than BPF_F_INGRESS, for an
 * empty slot, or for a target that is not an allowed redirect target;
 * SK_PASS otherwise.
 */
BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
	   struct bpf_map *, map, u32, key, u64, flags)
{
	struct sock *sk;

	if (unlikely(flags & ~(BPF_F_INGRESS)))
		return SK_DROP;

	sk = __sock_map_lookup_elem(map, key);
	if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
		return SK_DROP;

	skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS);
	return SK_PASS;
}

const struct bpf_func_proto bpf_sk_redirect_map_proto = {
	.func           = bpf_sk_redirect_map,
	.gpl_only       = false,
	.ret_type       = RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type      = ARG_CONST_MAP_PTR,
	.arg3_type      = ARG_ANYTHING,
	.arg4_type      = ARG_ANYTHING,
};

/* BPF helper: record in @msg the socket stored at index @key of a sockmap
 * as the redirect target. In addition to the checks done for skb
 * redirection, a redirect without BPF_F_INGRESS is only accepted when the
 * target is a TCP socket.
 */
BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg *, msg,
	   struct bpf_map *, map, u32, key, u64, flags)
{
	struct sock *sk;

	if (unlikely(flags & ~(BPF_F_INGRESS)))
		return SK_DROP;

	sk = __sock_map_lookup_elem(map, key);
	if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
		return SK_DROP;
	if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk))
		return SK_DROP;

	msg->flags = flags;
	msg->sk_redir = sk;
	return SK_PASS;
}

const struct bpf_func_proto bpf_msg_redirect_map_proto = {
	.func           = bpf_msg_redirect_map,
	.gpl_only       = false,
	.ret_type       = RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type      = ARG_CONST_MAP_PTR,
	.arg3_type      = ARG_ANYTHING,
	.arg4_type      = ARG_ANYTHING,
};

/* Iterator state for walking a sockmap: current index and the socket
 * found there (may be NULL).
 */
struct sock_map_seq_info {
	struct bpf_map *map;
	struct sock *sk;
	u32 index;
};

/* Context handed to a sockmap/sockhash iterator program. */
struct bpf_iter__sockmap {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct bpf_map *, map);
	__bpf_md_ptr(void *, key);
	__bpf_md_ptr(struct sock *, sk);
};

DEFINE_BPF_ITER_FUNC(sockmap, struct bpf_iter_meta *meta,
		     struct bpf_map *map, void *key,
		     struct sock *sk)

/* Load the socket at info->index into info->sk. Returns @info, or NULL
 * once the index is past the end of the map.
 */
static void *sock_map_seq_lookup_elem(struct sock_map_seq_info *info)
{
	if
(unlikely(info->index >= info->map->max_entries))
		return NULL;

	info->sk = __sock_map_lookup_elem(info->map, info->index);

	/* can't return sk directly, since that might be NULL */
	return info;
}

/* seq_file start: enter the RCU read-side section and return the element
 * at the current index.
 */
static void *sock_map_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(rcu)
{
	struct sock_map_seq_info *info = seq->private;

	if (*pos == 0)
		++*pos;

	/* pairs with sock_map_seq_stop */
	rcu_read_lock();
	return sock_map_seq_lookup_elem(info);
}

/* seq_file next: advance position and index, return the next element. */
static void *sock_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
	__must_hold(rcu)
{
	struct sock_map_seq_info *info = seq->private;

	++*pos;
	++info->index;

	return sock_map_seq_lookup_elem(info);
}

/* seq_file show: run the iterator program for the current element. With
 * @v == NULL the program is invoked with key and sk left NULL.
 */
static int sock_map_seq_show(struct seq_file *seq, void *v)
	__must_hold(rcu)
{
	struct sock_map_seq_info *info = seq->private;
	struct bpf_iter__sockmap ctx = {};
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, !v);
	if (!prog)
		return 0;

	ctx.meta = &meta;
	ctx.map = info->map;
	if (v) {
		ctx.key = &info->index;
		ctx.sk = info->sk;
	}

	return bpf_iter_run_prog(prog, &ctx);
}

/* seq_file stop: when iteration ended (@v == NULL) give the program a
 * final call, then leave the RCU read-side section.
 */
static void sock_map_seq_stop(struct seq_file *seq, void *v)
	__releases(rcu)
{
	if (!v)
		(void)sock_map_seq_show(seq, NULL);

	/* pairs with sock_map_seq_start */
	rcu_read_unlock();
}

static const struct seq_operations sock_map_seq_ops = {
	.start	= sock_map_seq_start,
	.next	= sock_map_seq_next,
	.stop	= sock_map_seq_stop,
	.show	= sock_map_seq_show,
};

/* Iterator setup: take a map reference (with uref) for the iterator's
 * lifetime and remember the map.
 */
static int sock_map_init_seq_private(void *priv_data,
				     struct bpf_iter_aux_info *aux)
{
	struct sock_map_seq_info *info = priv_data;

	bpf_map_inc_with_uref(aux->map);
	info->map = aux->map;
	return 0;
}

/* Iterator teardown: drop the reference taken at setup. */
static void sock_map_fini_seq_private(void *priv_data)
{
	struct sock_map_seq_info *info = priv_data;

	bpf_map_put_with_uref(info->map);
}

/* Memory usage of a sockmap: the map struct plus one pointer per entry. */
static u64 sock_map_mem_usage(const struct bpf_map *map)
{
	u64 usage = sizeof(struct bpf_stab);

	usage += (u64)map->max_entries * sizeof(struct sock *);
	return usage;
}

static const struct bpf_iter_seq_info sock_map_iter_seq_info = {
	.seq_ops		=
&sock_map_seq_ops,
	.init_seq_private	= sock_map_init_seq_private,
	.fini_seq_private	= sock_map_fini_seq_private,
	.seq_priv_size		= sizeof(struct sock_map_seq_info),
};

BTF_ID_LIST_SINGLE(sock_map_btf_ids, struct, bpf_stab)
/* Map operations for BPF_MAP_TYPE_SOCKMAP. */
const struct bpf_map_ops sock_map_ops = {
	.map_meta_equal		= bpf_map_meta_equal,
	.map_alloc		= sock_map_alloc,
	.map_free		= sock_map_free,
	.map_get_next_key	= sock_map_get_next_key,
	.map_lookup_elem_sys_only = sock_map_lookup_sys,
	.map_update_elem	= sock_map_update_elem,
	.map_delete_elem	= sock_map_delete_elem,
	.map_lookup_elem	= sock_map_lookup,
	.map_release_uref	= sock_map_release_progs,
	.map_check_btf		= map_check_no_btf,
	.map_mem_usage		= sock_map_mem_usage,
	.map_btf_id		= &sock_map_btf_ids[0],
	.iter_seq_info		= &sock_map_iter_seq_info,
};

/* One sockhash entry: the socket, the cached hash of the key and the key
 * bytes themselves (key_size bytes, stored inline). Freed via RCU.
 */
struct bpf_shtab_elem {
	struct rcu_head rcu;
	u32 hash;
	struct sock *sk;
	struct hlist_node node;
	u8 key[];
};

/* One hash bucket: an RCU hlist of elements and the lock guarding writes. */
struct bpf_shtab_bucket {
	struct hlist_head head;
	spinlock_t lock;
};

/* Hash-backed socket map. @buckets_num is a power of two, @count tracks
 * the number of allocated elements.
 */
struct bpf_shtab {
	struct bpf_map map;
	struct bpf_shtab_bucket *buckets;
	u32 buckets_num;
	u32 elem_size;
	struct sk_psock_progs progs;
	atomic_t count;
};

/* Hash @len bytes of @key with jhash, seed 0. */
static inline u32 sock_hash_bucket_hash(const void *key, u32 len)
{
	return jhash(key, len, 0);
}

/* Map @hash to its bucket; relies on buckets_num being a power of two. */
static struct bpf_shtab_bucket *sock_hash_select_bucket(struct bpf_shtab *htab,
							u32 hash)
{
	return &htab->buckets[hash & (htab->buckets_num - 1)];
}

/* Find the element in @head matching both @hash and the @key_size bytes
 * of @key, or NULL.
 */
static struct bpf_shtab_elem *
sock_hash_lookup_elem_raw(struct hlist_head *head, u32 hash, void *key,
			  u32 key_size)
{
	struct bpf_shtab_elem *elem;

	hlist_for_each_entry_rcu(elem, head, node) {
		if (elem->hash == hash &&
		    !memcmp(&elem->key, key, key_size))
			return elem;
	}

	return NULL;
}

/* Return the socket stored under @key, or NULL. Warns once if called
 * outside an RCU read-side section. No reference is taken.
 */
static struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map);
	u32 key_size = map->key_size, hash;
	struct bpf_shtab_bucket *bucket;
	struct bpf_shtab_elem *elem;

	WARN_ON_ONCE(!rcu_read_lock_held());

	hash = sock_hash_bucket_hash(key, key_size);
	bucket = sock_hash_select_bucket(htab, hash);
elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size);

	return elem ? elem->sk : NULL;
}

/* Account for one element less and free @elem after an RCU grace period. */
static void sock_hash_free_elem(struct bpf_shtab *htab,
				struct bpf_shtab_elem *elem)
{
	atomic_dec(&htab->count);
	kfree_rcu(elem, rcu);
}

/* Remove the element @link_raw from the sockhash, provided it is still
 * present in its bucket.
 */
static void sock_hash_delete_from_link(struct bpf_map *map, struct sock *sk,
				       void *link_raw)
{
	struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map);
	struct bpf_shtab_elem *elem_probe, *elem = link_raw;
	struct bpf_shtab_bucket *bucket;

	WARN_ON_ONCE(!rcu_read_lock_held());
	bucket = sock_hash_select_bucket(htab, elem->hash);

	/* elem may be deleted in parallel from the map, but access here
	 * is okay since it's going away only after RCU grace period.
	 * However, we need to check whether it's still present.
	 */
	spin_lock_bh(&bucket->lock);
	elem_probe = sock_hash_lookup_elem_raw(&bucket->head, elem->hash,
					       elem->key, map->key_size);
	if (elem_probe && elem_probe == elem) {
		hlist_del_rcu(&elem->node);
		sock_map_unref(elem->sk, elem);
		sock_hash_free_elem(htab, elem);
	}
	spin_unlock_bh(&bucket->lock);
}

/* map_delete_elem callback: remove the element stored under @key.
 * Returns 0 on success, -ENOENT if no such element exists.
 */
static long sock_hash_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map);
	u32 hash, key_size = map->key_size;
	struct bpf_shtab_bucket *bucket;
	struct bpf_shtab_elem *elem;
	int ret = -ENOENT;

	hash = sock_hash_bucket_hash(key, key_size);
	bucket = sock_hash_select_bucket(htab, hash);

	spin_lock_bh(&bucket->lock);
	elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size);
	if (elem) {
		hlist_del_rcu(&elem->node);
		sock_map_unref(elem->sk, elem);
		sock_hash_free_elem(htab, elem);
		ret = 0;
	}
	spin_unlock_bh(&bucket->lock);
	return ret;
}

/* Allocate and fill a new element for @key/@sk.
 *
 * The element count may exceed max_entries only while replacing an
 * existing element (@old != NULL). Returns ERR_PTR(-E2BIG) when the map is
 * full, ERR_PTR(-ENOMEM) on allocation failure.
 */
static struct bpf_shtab_elem *sock_hash_alloc_elem(struct bpf_shtab *htab,
						   void *key, u32 key_size,
						   u32 hash, struct sock *sk,
						   struct bpf_shtab_elem *old)
{
	struct bpf_shtab_elem *new;

	if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
		if (!old) {
			atomic_dec(&htab->count);
			return ERR_PTR(-E2BIG);
		}
	}

	new =
bpf_map_kmalloc_node(&htab->map, htab->elem_size,
				   GFP_ATOMIC | __GFP_NOWARN,
				   htab->map.numa_node);
	if (!new) {
		/* Undo the count increment taken above. */
		atomic_dec(&htab->count);
		return ERR_PTR(-ENOMEM);
	}
	memcpy(new->key, key, key_size);
	new->sk = sk;
	new->hash = hash;
	return new;
}

/* Store @sk under @key in the sockhash, honouring BPF_NOEXIST and
 * BPF_EXIST in @flags, and unlink any element previously stored under the
 * same key.
 *
 * Returns 0 on success; -EINVAL for unknown flags, -ENOMEM if no link
 * could be allocated, -EEXIST or -ENOENT when the flag condition is not
 * met, or the error from sock_map_link() / sock_hash_alloc_elem().
 */
static int sock_hash_update_common(struct bpf_map *map, void *key,
				   struct sock *sk, u64 flags)
{
	struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map);
	u32 key_size = map->key_size, hash;
	struct bpf_shtab_elem *elem, *elem_new;
	struct bpf_shtab_bucket *bucket;
	struct sk_psock_link *link;
	struct sk_psock *psock;
	int ret;

	WARN_ON_ONCE(!rcu_read_lock_held());
	if (unlikely(flags > BPF_EXIST))
		return -EINVAL;

	/* Allocate the link before taking the bucket lock. */
	link = sk_psock_init_link();
	if (!link)
		return -ENOMEM;

	ret = sock_map_link(map, sk);
	if (ret < 0)
		goto out_free;

	psock = sk_psock(sk);
	WARN_ON_ONCE(!psock);

	hash = sock_hash_bucket_hash(key, key_size);
	bucket = sock_hash_select_bucket(htab, hash);

	spin_lock_bh(&bucket->lock);
	elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size);
	if (elem && flags == BPF_NOEXIST) {
		ret = -EEXIST;
		goto out_unlock;
	} else if (!elem && flags == BPF_EXIST) {
		ret = -ENOENT;
		goto out_unlock;
	}

	elem_new = sock_hash_alloc_elem(htab, key, key_size, hash, sk, elem);
	if (IS_ERR(elem_new)) {
		ret = PTR_ERR(elem_new);
		goto out_unlock;
	}

	sock_map_add_link(psock, link, map, elem_new);
	/* Add new element to the head of the list, so that
	 * concurrent search will find it before old elem.
*/
	hlist_add_head_rcu(&elem_new->node, &bucket->head);
	if (elem) {
		hlist_del_rcu(&elem->node);
		sock_map_unref(elem->sk, elem);
		sock_hash_free_elem(htab, elem);
	}
	spin_unlock_bh(&bucket->lock);
	return 0;
out_unlock:
	spin_unlock_bh(&bucket->lock);
	/* Drop the psock reference obtained via sock_map_link().
	 * NOTE(review): unlike sock_map_update_common(), psock is not
	 * NULL-checked here - confirm whether that is intentional.
	 */
	sk_psock_put(sk, psock);
out_free:
	sk_psock_free_link(link);
	return ret;
}

/* map_get_next_key callback: copy the key following @key into @key_next.
 *
 * A NULL or unknown @key restarts from the first bucket. Otherwise the
 * next element in the same bucket is used, then the first element of the
 * following non-empty bucket. Returns -ENOENT when none is left.
 */
static int sock_hash_get_next_key(struct bpf_map *map, void *key,
				  void *key_next)
{
	struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map);
	struct bpf_shtab_elem *elem, *elem_next;
	u32 hash, key_size = map->key_size;
	struct hlist_head *head;
	int i = 0;

	if (!key)
		goto find_first_elem;
	hash = sock_hash_bucket_hash(key, key_size);
	head = &sock_hash_select_bucket(htab, hash)->head;
	elem = sock_hash_lookup_elem_raw(head, hash, key, key_size);
	if (!elem)
		goto find_first_elem;

	elem_next = hlist_entry_safe(rcu_dereference(hlist_next_rcu(&elem->node)),
				     struct bpf_shtab_elem, node);
	if (elem_next) {
		memcpy(key_next, elem_next->key, key_size);
		return 0;
	}

	/* Bucket exhausted: continue with the one after it. */
	i = hash & (htab->buckets_num - 1);
	i++;
find_first_elem:
	for (; i < htab->buckets_num; i++) {
		head = &sock_hash_select_bucket(htab, i)->head;
		elem_next = hlist_entry_safe(rcu_dereference(hlist_first_rcu(head)),
					     struct bpf_shtab_elem, node);
		if (elem_next) {
			memcpy(key_next, elem_next->key, key_size);
			return 0;
		}
	}

	return -ENOENT;
}

/* Allocate a sockhash from @attr.
 *
 * Requires non-zero max_entries and key_size, a 4- or 8-byte value and no
 * flags outside SOCK_CREATE_FLAG_MASK (-EINVAL otherwise); keys larger than
 * MAX_BPF_STACK yield -E2BIG. The bucket count is max_entries rounded up to
 * a power of two.
 */
static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
{
	struct bpf_shtab *htab;
	int i, err;

	if (attr->max_entries == 0 ||
	    attr->key_size    == 0 ||
	    (attr->value_size != sizeof(u32) &&
	     attr->value_size != sizeof(u64)) ||
	    attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
		return ERR_PTR(-EINVAL);
	if (attr->key_size > MAX_BPF_STACK)
		return ERR_PTR(-E2BIG);

	htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE);
	if (!htab)
		return ERR_PTR(-ENOMEM);

	bpf_map_init_from_attr(&htab->map, attr);

	htab->buckets_num = roundup_pow_of_two(htab->map.max_entries);
	htab->elem_size = sizeof(struct bpf_shtab_elem) +
			  round_up(htab->map.key_size, 8);
	if (htab->buckets_num == 0 ||
	    htab->buckets_num
> U32_MAX / sizeof(struct bpf_shtab_bucket)) {
		/* Bucket count wrapped to 0 or the array size would not fit
		 * in 32 bits.
		 */
		err = -EINVAL;
		goto free_htab;
	}

	htab->buckets = bpf_map_area_alloc(htab->buckets_num *
					   sizeof(struct bpf_shtab_bucket),
					   htab->map.numa_node);
	if (!htab->buckets) {
		err = -ENOMEM;
		goto free_htab;
	}

	for (i = 0; i < htab->buckets_num; i++) {
		INIT_HLIST_HEAD(&htab->buckets[i].head);
		spin_lock_init(&htab->buckets[i].lock);
	}

	return &htab->map;
free_htab:
	bpf_map_area_free(htab);
	return ERR_PTR(err);
}

/* Release a sockhash: unlink and free every element still stored, then
 * free the bucket array and the map itself.
 */
static void sock_hash_free(struct bpf_map *map)
{
	struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map);
	struct bpf_shtab_bucket *bucket;
	struct hlist_head unlink_list;
	struct bpf_shtab_elem *elem;
	struct hlist_node *node;
	int i;

	/* After the sync no updates or deletes will be in-flight so it
	 * is safe to walk map and remove entries without risking a race
	 * in EEXIST update case.
	 */
	synchronize_rcu();
	for (i = 0; i < htab->buckets_num; i++) {
		bucket = sock_hash_select_bucket(htab, i);

		/* We are racing with sock_hash_delete_from_link to
		 * enter the spin-lock critical section. Every socket on
		 * the list is still linked to sockhash. Since link
		 * exists, psock exists and holds a ref to socket. That
		 * lets us to grab a socket ref too.
		 */
		spin_lock_bh(&bucket->lock);
		hlist_for_each_entry(elem, &bucket->head, node)
			sock_hold(elem->sk);
		hlist_move_list(&bucket->head, &unlink_list);
		spin_unlock_bh(&bucket->lock);

		/* Process removed entries out of atomic context to
		 * block for socket lock before deleting the psock's
		 * link to sockhash.
*/
		hlist_for_each_entry_safe(elem, node, &unlink_list, node) {
			hlist_del(&elem->node);
			lock_sock(elem->sk);
			rcu_read_lock();
			sock_map_unref(elem->sk, elem);
			rcu_read_unlock();
			release_sock(elem->sk);
			/* Pairs with the sock_hold() taken under the lock. */
			sock_put(elem->sk);
			sock_hash_free_elem(htab, elem);
		}
	}

	/* wait for psock readers accessing its map link */
	synchronize_rcu();

	bpf_map_area_free(htab->buckets);
	bpf_map_area_free(htab);
}

/* map_lookup_elem_sys_only callback: returns a pointer to the socket's
 * cookie. Only maps with 8-byte values are supported (-ENOSPC otherwise);
 * -ENOENT when no element is stored under @key.
 */
static void *sock_hash_lookup_sys(struct bpf_map *map, void *key)
{
	struct sock *sk;

	if (map->value_size != sizeof(u64))
		return ERR_PTR(-ENOSPC);

	sk = __sock_hash_lookup_elem(map, key);
	if (!sk)
		return ERR_PTR(-ENOENT);

	__sock_gen_cookie(sk);
	return &sk->sk_cookie;
}

/* map_lookup_elem callback: like __sock_hash_lookup_elem(), but takes a
 * reference on refcounted sockets and returns NULL when the refcount has
 * already reached zero.
 */
static void *sock_hash_lookup(struct bpf_map *map, void *key)
{
	struct sock *sk;

	sk = __sock_hash_lookup_elem(map, key);
	if (!sk)
		return NULL;
	if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt))
		return NULL;
	return sk;
}

/* Drop the programs attached to a sockhash. */
static void sock_hash_release_progs(struct bpf_map *map)
{
	psock_progs_drop(&container_of(map, struct bpf_shtab, map)->progs);
}

/* BPF helper: insert the sock_ops socket into a sockhash under @key.
 * Returns -EOPNOTSUPP unless the socket is suitable and the current op is
 * one accepted by sock_map_op_okay().
 */
BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, sops,
	   struct bpf_map *, map, void *, key, u64, flags)
{
	WARN_ON_ONCE(!rcu_read_lock_held());

	if (likely(sock_map_sk_is_suitable(sops->sk) &&
		   sock_map_op_okay(sops)))
		return sock_hash_update_common(map, key, sops->sk, flags);
	return -EOPNOTSUPP;
}

const struct bpf_func_proto bpf_sock_hash_update_proto = {
	.func		= bpf_sock_hash_update,
	.gpl_only	= false,
	.pkt_access	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_PTR_TO_MAP_KEY,
	.arg4_type	= ARG_ANYTHING,
};

/* BPF helper: mark @skb for redirection to the socket stored under @key in
 * a sockhash. Returns SK_DROP for flags other than BPF_F_INGRESS, for a
 * missing element, or for a target that is not an allowed redirect target;
 * SK_PASS otherwise.
 */
BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
	   struct bpf_map *, map, void *, key, u64, flags)
{
	struct sock *sk;

	if (unlikely(flags & ~(BPF_F_INGRESS)))
		return SK_DROP;

	sk = __sock_hash_lookup_elem(map, key);
	if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
		return SK_DROP;

	skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS);
	return SK_PASS;
}

const struct bpf_func_proto
bpf_sk_redirect_hash_proto = {
	.func           = bpf_sk_redirect_hash,
	.gpl_only       = false,
	.ret_type       = RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type      = ARG_CONST_MAP_PTR,
	.arg3_type      = ARG_PTR_TO_MAP_KEY,
	.arg4_type      = ARG_ANYTHING,
};

/* BPF helper: record in @msg the socket stored under @key in a sockhash as
 * the redirect target. In addition to the checks done for skb redirection,
 * a redirect without BPF_F_INGRESS is only accepted when the target is a
 * TCP socket.
 */
BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg *, msg,
	   struct bpf_map *, map, void *, key, u64, flags)
{
	struct sock *sk;

	if (unlikely(flags & ~(BPF_F_INGRESS)))
		return SK_DROP;

	sk = __sock_hash_lookup_elem(map, key);
	if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
		return SK_DROP;
	if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk))
		return SK_DROP;

	msg->flags = flags;
	msg->sk_redir = sk;
	return SK_PASS;
}

const struct bpf_func_proto bpf_msg_redirect_hash_proto = {
	.func           = bpf_msg_redirect_hash,
	.gpl_only       = false,
	.ret_type       = RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type      = ARG_CONST_MAP_PTR,
	.arg3_type      = ARG_PTR_TO_MAP_KEY,
	.arg4_type      = ARG_ANYTHING,
};

/* Iterator state for walking a sockhash: the bucket currently visited. */
struct sock_hash_seq_info {
	struct bpf_map *map;
	struct bpf_shtab *htab;
	u32 bucket_id;
};

/* Return the element after @prev_elem, or the first element at or after
 * info->bucket_id when @prev_elem is NULL. Advances info->bucket_id as
 * buckets are exhausted; returns NULL at the end of the table.
 */
static void *sock_hash_seq_find_next(struct sock_hash_seq_info *info,
				     struct bpf_shtab_elem *prev_elem)
{
	const struct bpf_shtab *htab = info->htab;
	struct bpf_shtab_bucket *bucket;
	struct bpf_shtab_elem *elem;
	struct hlist_node *node;

	/* try to find next elem in the same bucket */
	if (prev_elem) {
		node = rcu_dereference(hlist_next_rcu(&prev_elem->node));
		elem = hlist_entry_safe(node, struct bpf_shtab_elem, node);
		if (elem)
			return elem;

		/* no more elements, continue in the next bucket */
		info->bucket_id++;
	}

	for (; info->bucket_id < htab->buckets_num; info->bucket_id++) {
		bucket = &htab->buckets[info->bucket_id];
		node = rcu_dereference(hlist_first_rcu(&bucket->head));
		elem = hlist_entry_safe(node, struct bpf_shtab_elem, node);
		if (elem)
			return elem;
	}

	return NULL;
}

/* seq_file start: enter the RCU read-side section and return the first
 * element at or after the current bucket.
 */
static void *sock_hash_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(rcu)
{
	struct sock_hash_seq_info *info = seq->private;

	if (*pos == 0)
		++*pos;

	/* pairs with sock_hash_seq_stop */
	rcu_read_lock();
	return
sock_hash_seq_find_next(info, NULL); } static void *sock_hash_seq_next(struct seq_file *seq, void *v, loff_t *pos) __must_hold(rcu) { struct sock_hash_seq_info *info = seq->private; ++*pos; return sock_hash_seq_find_next(info, v); } static int sock_hash_seq_show(struct seq_file *seq, void *v) __must_hold(rcu) { struct sock_hash_seq_info *info = seq->private; struct bpf_iter__sockmap ctx = {}; struct bpf_shtab_elem *elem = v; struct bpf_iter_meta meta; struct bpf_prog *prog; meta.seq = seq; prog = bpf_iter_get_info(&meta, !elem); if (!prog) return 0; ctx.meta = &meta; ctx.map = info->map; if (elem) { ctx.key = elem->key; ctx.sk = elem->sk; } return bpf_iter_run_prog(prog, &ctx); } static void sock_hash_seq_stop(struct seq_file *seq, void *v) __releases(rcu) { if (!v) (void)sock_hash_seq_show(seq, NULL); /* pairs with sock_hash_seq_start */ rcu_read_unlock(); } static const struct seq_operations sock_hash_seq_ops = { .start = sock_hash_seq_start, .next = sock_hash_seq_next, .stop = sock_hash_seq_stop, .show = sock_hash_seq_show, }; static int sock_hash_init_seq_private(void *priv_data, struct bpf_iter_aux_info *aux) { struct sock_hash_seq_info *info = priv_data; bpf_map_inc_with_uref(aux->map); info->map = aux->map; info->htab = container_of(aux->map, struct bpf_shtab, map); return 0; } static void sock_hash_fini_seq_private(void *priv_data) { struct sock_hash_seq_info *info = priv_data; bpf_map_put_with_uref(info->map); } static u64 sock_hash_mem_usage(const struct bpf_map *map) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); u64 usage = sizeof(*htab); usage += htab->buckets_num * sizeof(struct bpf_shtab_bucket); usage += atomic_read(&htab->count) * (u64)htab->elem_size; return usage; } static const struct bpf_iter_seq_info sock_hash_iter_seq_info = { .seq_ops = &sock_hash_seq_ops, .init_seq_private = sock_hash_init_seq_private, .fini_seq_private = sock_hash_fini_seq_private, .seq_priv_size = sizeof(struct sock_hash_seq_info), }; 
BTF_ID_LIST_SINGLE(sock_hash_map_btf_ids, struct, bpf_shtab) const struct bpf_map_ops sock_hash_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = sock_hash_alloc, .map_free = sock_hash_free, .map_get_next_key = sock_hash_get_next_key, .map_update_elem = sock_map_update_elem, .map_delete_elem = sock_hash_delete_elem, .map_lookup_elem = sock_hash_lookup, .map_lookup_elem_sys_only = sock_hash_lookup_sys, .map_release_uref = sock_hash_release_progs, .map_check_btf = map_check_no_btf, .map_mem_usage = sock_hash_mem_usage, .map_btf_id = &sock_hash_map_btf_ids[0], .iter_seq_info = &sock_hash_iter_seq_info, }; static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) { switch (map->map_type) { case BPF_MAP_TYPE_SOCKMAP: return &container_of(map, struct bpf_stab, map)->progs; case BPF_MAP_TYPE_SOCKHASH: return &container_of(map, struct bpf_shtab, map)->progs; default: break; } return NULL; } static int sock_map_prog_lookup(struct bpf_map *map, struct bpf_prog ***pprog, u32 which) { struct sk_psock_progs *progs = sock_map_progs(map); if (!progs) return -EOPNOTSUPP; switch (which) { case BPF_SK_MSG_VERDICT: *pprog = &progs->msg_parser; break; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) case BPF_SK_SKB_STREAM_PARSER: *pprog = &progs->stream_parser; break; #endif case BPF_SK_SKB_STREAM_VERDICT: if (progs->skb_verdict) return -EBUSY; *pprog = &progs->stream_verdict; break; case BPF_SK_SKB_VERDICT: if (progs->stream_verdict) return -EBUSY; *pprog = &progs->skb_verdict; break; default: return -EOPNOTSUPP; } return 0; } static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, struct bpf_prog *old, u32 which) { struct bpf_prog **pprog; int ret; ret = sock_map_prog_lookup(map, &pprog, which); if (ret) return ret; if (old) return psock_replace_prog(pprog, prog, old); psock_set_prog(pprog, prog); return 0; } int sock_map_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { __u32 __user *prog_ids = 
u64_to_user_ptr(attr->query.prog_ids); u32 prog_cnt = 0, flags = 0, ufd = attr->target_fd; struct bpf_prog **pprog; struct bpf_prog *prog; struct bpf_map *map; struct fd f; u32 id = 0; int ret; if (attr->query.query_flags) return -EINVAL; f = fdget(ufd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); rcu_read_lock(); ret = sock_map_prog_lookup(map, &pprog, attr->query.attach_type); if (ret) goto end; prog = *pprog; prog_cnt = !prog ? 0 : 1; if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) goto end; /* we do not hold the refcnt, the bpf prog may be released * asynchronously and the id would be set to 0. */ id = data_race(prog->aux->id); if (id == 0) prog_cnt = 0; end: rcu_read_unlock(); if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)) || (id != 0 && copy_to_user(prog_ids, &id, sizeof(u32))) || copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) ret = -EFAULT; fdput(f); return ret; } static void sock_map_unlink(struct sock *sk, struct sk_psock_link *link) { switch (link->map->map_type) { case BPF_MAP_TYPE_SOCKMAP: return sock_map_delete_from_link(link->map, sk, link->link_raw); case BPF_MAP_TYPE_SOCKHASH: return sock_hash_delete_from_link(link->map, sk, link->link_raw); default: break; } } static void sock_map_remove_links(struct sock *sk, struct sk_psock *psock) { struct sk_psock_link *link; while ((link = sk_psock_link_pop(psock))) { sock_map_unlink(sk, link); sk_psock_free_link(link); } } void sock_map_unhash(struct sock *sk) { void (*saved_unhash)(struct sock *sk); struct sk_psock *psock; rcu_read_lock(); psock = sk_psock(sk); if (unlikely(!psock)) { rcu_read_unlock(); saved_unhash = READ_ONCE(sk->sk_prot)->unhash; } else { saved_unhash = psock->saved_unhash; sock_map_remove_links(sk, psock); rcu_read_unlock(); } if (WARN_ON_ONCE(saved_unhash == sock_map_unhash)) return; if (saved_unhash) saved_unhash(sk); } EXPORT_SYMBOL_GPL(sock_map_unhash); void sock_map_destroy(struct sock *sk) { void 
(*saved_destroy)(struct sock *sk); struct sk_psock *psock; rcu_read_lock(); psock = sk_psock_get(sk); if (unlikely(!psock)) { rcu_read_unlock(); saved_destroy = READ_ONCE(sk->sk_prot)->destroy; } else { saved_destroy = psock->saved_destroy; sock_map_remove_links(sk, psock); rcu_read_unlock(); sk_psock_stop(psock); sk_psock_put(sk, psock); } if (WARN_ON_ONCE(saved_destroy == sock_map_destroy)) return; if (saved_destroy) saved_destroy(sk); } EXPORT_SYMBOL_GPL(sock_map_destroy); void sock_map_close(struct sock *sk, long timeout) { void (*saved_close)(struct sock *sk, long timeout); struct sk_psock *psock; lock_sock(sk); rcu_read_lock(); psock = sk_psock_get(sk); if (unlikely(!psock)) { rcu_read_unlock(); release_sock(sk); saved_close = READ_ONCE(sk->sk_prot)->close; } else { saved_close = psock->saved_close; sock_map_remove_links(sk, psock); rcu_read_unlock(); sk_psock_stop(psock); release_sock(sk); cancel_delayed_work_sync(&psock->work); sk_psock_put(sk, psock); } /* Make sure we do not recurse. This is a bug. * Leak the socket instead of crashing on a stack overflow. 
*/ if (WARN_ON_ONCE(saved_close == sock_map_close)) return; saved_close(sk, timeout); } EXPORT_SYMBOL_GPL(sock_map_close); static int sock_map_iter_attach_target(struct bpf_prog *prog, union bpf_iter_link_info *linfo, struct bpf_iter_aux_info *aux) { struct bpf_map *map; int err = -EINVAL; if (!linfo->map.map_fd) return -EBADF; map = bpf_map_get_with_uref(linfo->map.map_fd); if (IS_ERR(map)) return PTR_ERR(map); if (map->map_type != BPF_MAP_TYPE_SOCKMAP && map->map_type != BPF_MAP_TYPE_SOCKHASH) goto put_map; if (prog->aux->max_rdonly_access > map->key_size) { err = -EACCES; goto put_map; } aux->map = map; return 0; put_map: bpf_map_put_with_uref(map); return err; } static void sock_map_iter_detach_target(struct bpf_iter_aux_info *aux) { bpf_map_put_with_uref(aux->map); } static struct bpf_iter_reg sock_map_iter_reg = { .target = "sockmap", .attach_target = sock_map_iter_attach_target, .detach_target = sock_map_iter_detach_target, .show_fdinfo = bpf_iter_map_show_fdinfo, .fill_link_info = bpf_iter_map_fill_link_info, .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__sockmap, key), PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY }, { offsetof(struct bpf_iter__sockmap, sk), PTR_TO_BTF_ID_OR_NULL }, }, }; static int __init bpf_sockmap_iter_init(void) { sock_map_iter_reg.ctx_arg_info[1].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK]; return bpf_iter_reg_target(&sock_map_iter_reg); } late_initcall(bpf_sockmap_iter_init);
82 10 74 72 19 57 50 48 2 5 5 42 198 196 199 50 25 120 1 167 20 145 94 54 1 5 141 1 1 7 3 1 3 2 2 1 1 1 1 3 2 1 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 // SPDX-License-Identifier: GPL-2.0-only /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> * (C) 2006-2010 Patrick McHardy <kaber@trash.net> */ #include <linux/types.h> #include <linux/timer.h> #include <linux/netfilter.h> #include <linux/in.h> #include <linux/icmp.h> #include <linux/seq_file.h> #include <net/ip.h> #include <net/checksum.h> #include <linux/netfilter_ipv4.h> #include 
<net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_log.h> #include "nf_internals.h" static const unsigned int nf_ct_icmp_timeout = 30*HZ; bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct net *net, struct nf_conntrack_tuple *tuple) { const struct icmphdr *hp; struct icmphdr _hdr; hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); if (hp == NULL) return false; tuple->dst.u.icmp.type = hp->type; tuple->src.u.icmp.id = hp->un.echo.id; tuple->dst.u.icmp.code = hp->code; return true; } /* Add 1; spaces filled with 0. */ static const u_int8_t invmap[] = { [ICMP_ECHO] = ICMP_ECHOREPLY + 1, [ICMP_ECHOREPLY] = ICMP_ECHO + 1, [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1, [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1, [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1, [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1, [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1, [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1 }; bool nf_conntrack_invert_icmp_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig) { if (orig->dst.u.icmp.type >= sizeof(invmap) || !invmap[orig->dst.u.icmp.type]) return false; tuple->src.u.icmp.id = orig->src.u.icmp.id; tuple->dst.u.icmp.type = invmap[orig->dst.u.icmp.type] - 1; tuple->dst.u.icmp.code = orig->dst.u.icmp.code; return true; } /* Returns verdict for packet, or -1 for invalid. */ int nf_conntrack_icmp_packet(struct nf_conn *ct, struct sk_buff *skb, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state) { /* Do not immediately delete the connection after the first successful reply to avoid excessive conntrackd traffic and also to handle correctly ICMP echo reply duplicates. 
*/ unsigned int *timeout = nf_ct_timeout_lookup(ct); static const u_int8_t valid_new[] = { [ICMP_ECHO] = 1, [ICMP_TIMESTAMP] = 1, [ICMP_INFO_REQUEST] = 1, [ICMP_ADDRESS] = 1 }; if (state->pf != NFPROTO_IPV4) return -NF_ACCEPT; if (ct->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) || !valid_new[ct->tuplehash[0].tuple.dst.u.icmp.type]) { /* Can't create a new ICMP `conn' with this. */ pr_debug("icmp: can't create new conn with type %u\n", ct->tuplehash[0].tuple.dst.u.icmp.type); nf_ct_dump_tuple_ip(&ct->tuplehash[0].tuple); return -NF_ACCEPT; } if (!timeout) timeout = &nf_icmp_pernet(nf_ct_net(ct))->timeout; nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); return NF_ACCEPT; } /* Check inner header is related to any of the existing connections */ int nf_conntrack_inet_error(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state, u8 l4proto, union nf_inet_addr *outer_daddr) { struct nf_conntrack_tuple innertuple, origtuple; const struct nf_conntrack_tuple_hash *h; const struct nf_conntrack_zone *zone; enum ip_conntrack_info ctinfo; struct nf_conntrack_zone tmp; union nf_inet_addr *ct_daddr; enum ip_conntrack_dir dir; struct nf_conn *ct; WARN_ON(skb_nfct(skb)); zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); /* Are they talking about one of our connections? */ if (!nf_ct_get_tuplepr(skb, dataoff, state->pf, state->net, &origtuple)) return -NF_ACCEPT; /* Ordinarily, we'd expect the inverted tupleproto, but it's been preserved inside the ICMP. */ if (!nf_ct_invert_tuple(&innertuple, &origtuple)) return -NF_ACCEPT; h = nf_conntrack_find_get(state->net, zone, &innertuple); if (!h) return -NF_ACCEPT; /* Consider: A -> T (=This machine) -> B * Conntrack entry will look like this: * Original: A->B * Reply: B->T (SNAT case) OR A * * When this function runs, we got packet that looks like this: * iphdr|icmphdr|inner_iphdr|l4header (tcp, udp, ..). 
* * Above nf_conntrack_find_get() makes lookup based on inner_hdr, * so we should expect that destination of the found connection * matches outer header destination address. * * In above example, we can consider these two cases: * 1. Error coming in reply direction from B or M (middle box) to * T (SNAT case) or A. * Inner saddr will be B, dst will be T or A. * The found conntrack will be reply tuple (B->T/A). * 2. Error coming in original direction from A or M to B. * Inner saddr will be A, inner daddr will be B. * The found conntrack will be original tuple (A->B). * * In both cases, conntrack[dir].dst == inner.dst. * * A bogus packet could look like this: * Inner: B->T * Outer: B->X (other machine reachable by T). * * In this case, lookup yields connection A->B and will * set packet from B->X as *RELATED*, even though no connection * from X was ever seen. */ ct = nf_ct_tuplehash_to_ctrack(h); dir = NF_CT_DIRECTION(h); ct_daddr = &ct->tuplehash[dir].tuple.dst.u3; if (!nf_inet_addr_cmp(outer_daddr, ct_daddr)) { if (state->pf == AF_INET) { nf_l4proto_log_invalid(skb, state, l4proto, "outer daddr %pI4 != inner %pI4", &outer_daddr->ip, &ct_daddr->ip); } else if (state->pf == AF_INET6) { nf_l4proto_log_invalid(skb, state, l4proto, "outer daddr %pI6 != inner %pI6", &outer_daddr->ip6, &ct_daddr->ip6); } nf_ct_put(ct); return -NF_ACCEPT; } ctinfo = IP_CT_RELATED; if (dir == IP_CT_DIR_REPLY) ctinfo += IP_CT_IS_REPLY; /* Update skb to refer to this connection */ nf_ct_set(skb, ct, ctinfo); return NF_ACCEPT; } static void icmp_error_log(const struct sk_buff *skb, const struct nf_hook_state *state, const char *msg) { nf_l4proto_log_invalid(skb, state, IPPROTO_ICMP, "%s", msg); } /* Small and modified version of icmp_rcv */ int nf_conntrack_icmpv4_error(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state) { union nf_inet_addr outer_daddr; const struct icmphdr *icmph; struct icmphdr _ih; /* Not enough header? 
*/ icmph = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih); if (icmph == NULL) { icmp_error_log(skb, state, "short packet"); return -NF_ACCEPT; } /* See nf_conntrack_proto_tcp.c */ if (state->net->ct.sysctl_checksum && state->hook == NF_INET_PRE_ROUTING && nf_ip_checksum(skb, state->hook, dataoff, IPPROTO_ICMP)) { icmp_error_log(skb, state, "bad hw icmp checksum"); return -NF_ACCEPT; } /* * 18 is the highest 'known' ICMP type. Anything else is a mystery * * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently * discarded. */ if (icmph->type > NR_ICMP_TYPES) { icmp_error_log(skb, state, "invalid icmp type"); return -NF_ACCEPT; } /* Need to track icmp error message? */ if (!icmp_is_err(icmph->type)) return NF_ACCEPT; memset(&outer_daddr, 0, sizeof(outer_daddr)); outer_daddr.ip = ip_hdr(skb)->daddr; dataoff += sizeof(*icmph); return nf_conntrack_inet_error(tmpl, skb, dataoff, state, IPPROTO_ICMP, &outer_daddr); } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> static int icmp_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *t) { if (nla_put_be16(skb, CTA_PROTO_ICMP_ID, t->src.u.icmp.id) || nla_put_u8(skb, CTA_PROTO_ICMP_TYPE, t->dst.u.icmp.type) || nla_put_u8(skb, CTA_PROTO_ICMP_CODE, t->dst.u.icmp.code)) goto nla_put_failure; return 0; nla_put_failure: return -1; } static const struct nla_policy icmp_nla_policy[CTA_PROTO_MAX+1] = { [CTA_PROTO_ICMP_TYPE] = { .type = NLA_U8 }, [CTA_PROTO_ICMP_CODE] = { .type = NLA_U8 }, [CTA_PROTO_ICMP_ID] = { .type = NLA_U16 }, }; static int icmp_nlattr_to_tuple(struct nlattr *tb[], struct nf_conntrack_tuple *tuple, u_int32_t flags) { if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_TYPE)) { if (!tb[CTA_PROTO_ICMP_TYPE]) return -EINVAL; tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMP_TYPE]); if (tuple->dst.u.icmp.type >= sizeof(invmap) || !invmap[tuple->dst.u.icmp.type]) return -EINVAL; } if (flags & 
CTA_FILTER_FLAG(CTA_PROTO_ICMP_CODE)) { if (!tb[CTA_PROTO_ICMP_CODE]) return -EINVAL; tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMP_CODE]); } if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_ID)) { if (!tb[CTA_PROTO_ICMP_ID]) return -EINVAL; tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMP_ID]); } return 0; } static unsigned int icmp_nlattr_tuple_size(void) { static unsigned int size __read_mostly; if (!size) size = nla_policy_len(icmp_nla_policy, CTA_PROTO_MAX + 1); return size; } #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { unsigned int *timeout = data; struct nf_icmp_net *in = nf_icmp_pernet(net); if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) { if (!timeout) timeout = &in->timeout; *timeout = ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMP_TIMEOUT])) * HZ; } else if (timeout) { /* Set default ICMP timeout. */ *timeout = in->timeout; } return 0; } static int icmp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) { const unsigned int *timeout = data; if (nla_put_be32(skb, CTA_TIMEOUT_ICMP_TIMEOUT, htonl(*timeout / HZ))) goto nla_put_failure; return 0; nla_put_failure: return -ENOSPC; } static const struct nla_policy icmp_timeout_nla_policy[CTA_TIMEOUT_ICMP_MAX+1] = { [CTA_TIMEOUT_ICMP_TIMEOUT] = { .type = NLA_U32 }, }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ void nf_conntrack_icmp_init_net(struct net *net) { struct nf_icmp_net *in = nf_icmp_pernet(net); in->timeout = nf_ct_icmp_timeout; } const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp = { .l4proto = IPPROTO_ICMP, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = icmp_tuple_to_nlattr, .nlattr_tuple_size = icmp_nlattr_tuple_size, .nlattr_to_tuple = icmp_nlattr_to_tuple, .nla_policy = icmp_nla_policy, #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = icmp_timeout_nlattr_to_obj, .obj_to_nlattr = 
icmp_timeout_obj_to_nlattr, .nlattr_max = CTA_TIMEOUT_ICMP_MAX, .obj_size = sizeof(unsigned int), .nla_policy = icmp_timeout_nla_policy, }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ };
5 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ #ifndef _NET_BATMAN_ADV_ORIGINATOR_H_ #define _NET_BATMAN_ADV_ORIGINATOR_H_ #include "main.h" #include <linux/compiler.h> #include <linux/if_ether.h> #include <linux/jhash.h> #include <linux/kref.h> #include <linux/netlink.h> #include <linux/skbuff.h> #include <linux/types.h> bool batadv_compare_orig(const struct hlist_node *node, const void *data2); int batadv_originator_init(struct batadv_priv *bat_priv); void batadv_originator_free(struct batadv_priv *bat_priv); void batadv_purge_orig_ref(struct batadv_priv *bat_priv); void batadv_orig_node_release(struct kref *ref); struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, const u8 *addr); struct batadv_hardif_neigh_node * batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface, const u8 *neigh_addr); void batadv_hardif_neigh_release(struct kref *ref); struct batadv_neigh_node * batadv_neigh_node_get_or_create(struct batadv_orig_node *orig_node, struct batadv_hard_iface *hard_iface, const u8 *neigh_addr); void batadv_neigh_node_release(struct kref *ref); struct batadv_neigh_node * batadv_orig_router_get(struct batadv_orig_node *orig_node, const struct batadv_hard_iface *if_outgoing); struct batadv_neigh_node * batadv_orig_to_router(struct batadv_priv *bat_priv, u8 *orig_addr, struct 
batadv_hard_iface *if_outgoing); struct batadv_neigh_ifinfo * batadv_neigh_ifinfo_new(struct batadv_neigh_node *neigh, struct batadv_hard_iface *if_outgoing); struct batadv_neigh_ifinfo * batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh, struct batadv_hard_iface *if_outgoing); void batadv_neigh_ifinfo_release(struct kref *ref); int batadv_hardif_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb); struct batadv_orig_ifinfo * batadv_orig_ifinfo_get(struct batadv_orig_node *orig_node, struct batadv_hard_iface *if_outgoing); struct batadv_orig_ifinfo * batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node, struct batadv_hard_iface *if_outgoing); void batadv_orig_ifinfo_release(struct kref *ref); int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb); struct batadv_orig_node_vlan * batadv_orig_node_vlan_new(struct batadv_orig_node *orig_node, unsigned short vid); struct batadv_orig_node_vlan * batadv_orig_node_vlan_get(struct batadv_orig_node *orig_node, unsigned short vid); void batadv_orig_node_vlan_release(struct kref *ref); /** * batadv_choose_orig() - Return the index of the orig entry in the hash table * @data: mac address of the originator node * @size: the size of the hash table * * Return: the hash index where the object represented by @data should be * stored at. 
*/ static inline u32 batadv_choose_orig(const void *data, u32 size) { u32 hash = 0; hash = jhash(data, ETH_ALEN, hash); return hash % size; } struct batadv_orig_node * batadv_orig_hash_find(struct batadv_priv *bat_priv, const void *data); /** * batadv_orig_node_vlan_put() - decrement the refcounter and possibly release * the originator-vlan object * @orig_vlan: the originator-vlan object to release */ static inline void batadv_orig_node_vlan_put(struct batadv_orig_node_vlan *orig_vlan) { if (!orig_vlan) return; kref_put(&orig_vlan->refcount, batadv_orig_node_vlan_release); } /** * batadv_neigh_ifinfo_put() - decrement the refcounter and possibly release * the neigh_ifinfo * @neigh_ifinfo: the neigh_ifinfo object to release */ static inline void batadv_neigh_ifinfo_put(struct batadv_neigh_ifinfo *neigh_ifinfo) { if (!neigh_ifinfo) return; kref_put(&neigh_ifinfo->refcount, batadv_neigh_ifinfo_release); } /** * batadv_hardif_neigh_put() - decrement the hardif neighbors refcounter * and possibly release it * @hardif_neigh: hardif neigh neighbor to free */ static inline void batadv_hardif_neigh_put(struct batadv_hardif_neigh_node *hardif_neigh) { if (!hardif_neigh) return; kref_put(&hardif_neigh->refcount, batadv_hardif_neigh_release); } /** * batadv_neigh_node_put() - decrement the neighbors refcounter and possibly * release it * @neigh_node: neigh neighbor to free */ static inline void batadv_neigh_node_put(struct batadv_neigh_node *neigh_node) { if (!neigh_node) return; kref_put(&neigh_node->refcount, batadv_neigh_node_release); } /** * batadv_orig_ifinfo_put() - decrement the refcounter and possibly release * the orig_ifinfo * @orig_ifinfo: the orig_ifinfo object to release */ static inline void batadv_orig_ifinfo_put(struct batadv_orig_ifinfo *orig_ifinfo) { if (!orig_ifinfo) return; kref_put(&orig_ifinfo->refcount, batadv_orig_ifinfo_release); } /** * batadv_orig_node_put() - decrement the orig node refcounter and possibly * release it * @orig_node: the orig node 
to free */ static inline void batadv_orig_node_put(struct batadv_orig_node *orig_node) { if (!orig_node) return; kref_put(&orig_node->refcount, batadv_orig_node_release); } #endif /* _NET_BATMAN_ADV_ORIGINATOR_H_ */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 /* SPDX-License-Identifier: GPL-2.0-only */ /* * AppArmor security module * * This file contains AppArmor network mediation definitions. * * Copyright (C) 1998-2008 Novell/SUSE * Copyright 2009-2017 Canonical Ltd. */ #ifndef __AA_NET_H #define __AA_NET_H #include <net/sock.h> #include <linux/path.h> #include "apparmorfs.h" #include "label.h" #include "perms.h" #include "policy.h" #define AA_MAY_SEND AA_MAY_WRITE #define AA_MAY_RECEIVE AA_MAY_READ #define AA_MAY_SHUTDOWN AA_MAY_DELETE #define AA_MAY_CONNECT AA_MAY_OPEN #define AA_MAY_ACCEPT 0x00100000 #define AA_MAY_BIND 0x00200000 #define AA_MAY_LISTEN 0x00400000 #define AA_MAY_SETOPT 0x01000000 #define AA_MAY_GETOPT 0x02000000 #define NET_PERMS_MASK (AA_MAY_SEND | AA_MAY_RECEIVE | AA_MAY_CREATE | \ AA_MAY_SHUTDOWN | AA_MAY_BIND | AA_MAY_LISTEN | \ AA_MAY_CONNECT | AA_MAY_ACCEPT | AA_MAY_SETATTR | \ AA_MAY_GETATTR | AA_MAY_SETOPT | AA_MAY_GETOPT) #define NET_FS_PERMS (AA_MAY_SEND | AA_MAY_RECEIVE | AA_MAY_CREATE | \ AA_MAY_SHUTDOWN | AA_MAY_CONNECT | AA_MAY_RENAME |\ AA_MAY_SETATTR | AA_MAY_GETATTR | AA_MAY_CHMOD | \ AA_MAY_CHOWN | AA_MAY_CHGRP | AA_MAY_LOCK | \ AA_MAY_MPROT) #define NET_PEER_MASK (AA_MAY_SEND | AA_MAY_RECEIVE | AA_MAY_CONNECT | \ AA_MAY_ACCEPT) struct aa_sk_ctx { struct aa_label *label; struct aa_label *peer; }; #define SK_CTX(X) ((X)->sk_security) static inline struct aa_sk_ctx *aa_sock(const struct sock *sk) { return sk->sk_security; } #define DEFINE_AUDIT_NET(NAME, OP, SK, F, T, P) \ struct lsm_network_audit NAME ## _net = { .sk = (SK), \ .family = (F)}; \ DEFINE_AUDIT_DATA(NAME, \ ((SK) && (F) != AF_UNIX) ? 
LSM_AUDIT_DATA_NET : \ LSM_AUDIT_DATA_NONE, \ AA_CLASS_NET, \ OP); \ NAME.common.u.net = &(NAME ## _net); \ NAME.net.type = (T); \ NAME.net.protocol = (P) #define DEFINE_AUDIT_SK(NAME, OP, SK) \ DEFINE_AUDIT_NET(NAME, OP, SK, (SK)->sk_family, (SK)->sk_type, \ (SK)->sk_protocol) #define af_select(FAMILY, FN, DEF_FN) \ ({ \ int __e; \ switch ((FAMILY)) { \ default: \ __e = DEF_FN; \ } \ __e; \ }) struct aa_secmark { u8 audit; u8 deny; u32 secid; char *label; }; extern struct aa_sfs_entry aa_sfs_entry_network[]; void audit_net_cb(struct audit_buffer *ab, void *va); int aa_profile_af_perm(struct aa_profile *profile, struct apparmor_audit_data *ad, u32 request, u16 family, int type); int aa_af_perm(const struct cred *subj_cred, struct aa_label *label, const char *op, u32 request, u16 family, int type, int protocol); static inline int aa_profile_af_sk_perm(struct aa_profile *profile, struct apparmor_audit_data *ad, u32 request, struct sock *sk) { return aa_profile_af_perm(profile, ad, request, sk->sk_family, sk->sk_type); } int aa_sk_perm(const char *op, u32 request, struct sock *sk); int aa_sock_file_perm(const struct cred *subj_cred, struct aa_label *label, const char *op, u32 request, struct socket *sock); int apparmor_secmark_check(struct aa_label *label, char *op, u32 request, u32 secid, const struct sock *sk); #endif /* __AA_NET_H */
40 74 37 35 26 16 3 16 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 
521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 
1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 
1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 
1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 
2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 
2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 
3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 
3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 
3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 
4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 
4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 
5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 
5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 
5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 
6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 
6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 
7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 
7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 /* SPDX-License-Identifier: GPL-2.0-only */ /* * mac80211 <-> driver interface * * Copyright 2002-2005, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH * Copyright (C) 2018 - 2024 Intel Corporation */ #ifndef MAC80211_H #define MAC80211_H #include <linux/bug.h> #include <linux/kernel.h> #include <linux/if_ether.h> #include <linux/skbuff.h> #include <linux/ieee80211.h> #include <linux/lockdep.h> #include <net/cfg80211.h> #include <net/codel.h> #include <net/ieee80211_radiotap.h> #include <asm/unaligned.h> /** * DOC: Introduction * * mac80211 is the Linux stack for 802.11 hardware that implements * only partial functionality in hard- or firmware. This document * defines the interface between mac80211 and low-level hardware * drivers. 
*/ /** * DOC: Calling mac80211 from interrupts * * Only ieee80211_tx_status_irqsafe() and ieee80211_rx_irqsafe() can be * called in hardware interrupt context. The low-level driver must not call any * other functions in hardware interrupt context. If there is a need for such * call, the low-level driver should first ACK the interrupt and perform the * IEEE 802.11 code call after this, e.g. from a scheduled workqueue or even * tasklet function. * * NOTE: If the driver opts to use the _irqsafe() functions, it may not also * use the non-IRQ-safe functions! */ /** * DOC: Warning * * If you're reading this document and not the header file itself, it will * be incomplete because not all documentation has been converted yet. */ /** * DOC: Frame format * * As a general rule, when frames are passed between mac80211 and the driver, * they start with the IEEE 802.11 header and include the same octets that are * sent over the air except for the FCS which should be calculated by the * hardware. * * There are, however, various exceptions to this rule for advanced features: * * The first exception is for hardware encryption and decryption offload * where the IV/ICV may or may not be generated in hardware. * * Secondly, when the hardware handles fragmentation, the frame handed to * the driver from mac80211 is the MSDU, not the MPDU. */ /** * DOC: mac80211 workqueue * * mac80211 provides its own workqueue for drivers and internal mac80211 use. * The workqueue is a single threaded workqueue and can only be accessed by * helpers for sanity checking. Drivers must ensure all work added onto the * mac80211 workqueue should be cancelled on the driver stop() callback. * * mac80211 will flush the workqueue upon interface removal and during * suspend. * * All work performed on the mac80211 workqueue must not acquire the RTNL lock. 
* */ /** * DOC: mac80211 software tx queueing * * mac80211 uses an intermediate queueing implementation, designed to allow the * driver to keep hardware queues short and to provide some fairness between * different stations/interfaces. * * Drivers must provide the .wake_tx_queue driver operation by either * linking it to ieee80211_handle_wake_tx_queue() or implementing a custom * handler. * * Intermediate queues (struct ieee80211_txq) are kept per-sta per-tid, with * another per-sta for non-data/non-mgmt and bufferable management frames, and * a single per-vif queue for multicast data frames. * * The driver is expected to initialize its private per-queue data for stations * and interfaces in the .add_interface and .sta_add ops. * * The driver can't access the internal TX queues (iTXQs) directly. * Whenever mac80211 adds a new frame to a queue, it calls the .wake_tx_queue * driver op. * Drivers implementing a custom .wake_tx_queue op can get them by calling * ieee80211_tx_dequeue(). Drivers using ieee80211_handle_wake_tx_queue() will * simply get the individual frames pushed via the .tx driver operation. * * Drivers can optionally delegate responsibility for scheduling queues to * mac80211, to take advantage of airtime fairness accounting. In this case, to * obtain the next queue to pull frames from, the driver calls * ieee80211_next_txq(). The driver is then expected to return the txq using * ieee80211_return_txq(). * * For AP powersave TIM handling, the driver only needs to indicate if it has * buffered packets in the driver specific data structures by calling * ieee80211_sta_set_buffered(). For frames buffered in the ieee80211_txq * struct, mac80211 sets the appropriate TIM PVB bits and calls * .release_buffered_frames(). * In that callback the driver is therefore expected to release its own * buffered frames and afterwards also frames from the ieee80211_txq (obtained * via the usual ieee80211_tx_dequeue). 
*/

/**
 * DOC: HW timestamping
 *
 * Timing Measurement and Fine Timing Measurement require accurate timestamps
 * of the action frames TX/RX and their respective acks.
 *
 * To report hardware timestamps for Timing Measurement or Fine Timing
 * Measurement frame RX, the low level driver should set the SKB's hwtstamp
 * field to the frame RX timestamp and report the ack TX timestamp in the
 * ieee80211_rx_status struct.
 *
 * Similarly, to report hardware timestamps for Timing Measurement or Fine
 * Timing Measurement frame TX, the driver should set the SKB's hwtstamp field
 * to the frame TX timestamp and report the ack RX timestamp in the
 * ieee80211_tx_status struct.
 */

/* Forward declaration of struct device; no definition is given here. */
struct device;

/**
 * enum ieee80211_max_queues - maximum number of queues
 *
 * @IEEE80211_MAX_QUEUES: Maximum number of regular device queues.
 * @IEEE80211_MAX_QUEUE_MAP: bitmap with maximum queues set
 */
enum ieee80211_max_queues {
	IEEE80211_MAX_QUEUES =		16,
	/* derived from IEEE80211_MAX_QUEUES, so the two always stay in step */
	IEEE80211_MAX_QUEUE_MAP =	BIT(IEEE80211_MAX_QUEUES) - 1,
};

/*
 * NOTE(review): presumably the "no valid hardware queue" marker; the users of
 * this value are not visible here -- confirm before relying on it.
 */
#define IEEE80211_INVAL_HW_QUEUE	0xff

/**
 * enum ieee80211_ac_numbers - AC numbers as used in mac80211
 * @IEEE80211_AC_VO: voice
 * @IEEE80211_AC_VI: video
 * @IEEE80211_AC_BE: best effort
 * @IEEE80211_AC_BK: background
 */
enum ieee80211_ac_numbers {
	/* values are spelled out explicitly: voice is 0, background is 3 */
	IEEE80211_AC_VO		= 0,
	IEEE80211_AC_VI		= 1,
	IEEE80211_AC_BE		= 2,
	IEEE80211_AC_BK		= 3,
};

/**
 * struct ieee80211_tx_queue_params - transmit queue configuration
 *
 * The information provided in this structure is required for QoS
 * transmit queue configuration. Cf. IEEE 802.11 7.3.2.29.
*
 * @aifs: arbitration interframe space [0..255]
 * @cw_min: minimum contention window [a value of the form
 *	2^n-1 in the range 1..32767]
 * @cw_max: maximum contention window [like @cw_min]
 * @txop: maximum burst time in units of 32 usecs, 0 meaning disabled
 * @acm: is mandatory admission control required for the access category
 * @uapsd: is U-APSD mode enabled for the queue
 * @mu_edca: is the MU EDCA configured
 * @mu_edca_param_rec: MU EDCA Parameter Record for HE
 */
struct ieee80211_tx_queue_params {
	u16 txop;
	u16 cw_min;
	u16 cw_max;
	u8 aifs;
	bool acm;
	bool uapsd;
	bool mu_edca;
	struct ieee80211_he_mu_edca_param_ac_rec mu_edca_param_rec;
};

/**
 * struct ieee80211_low_level_stats - low-level counters
 *
 * NOTE(review): the member names mirror IEEE 802.11 "dot11" MIB counter
 * names; who fills them in and when is not visible here -- confirm against
 * the users of this structure.
 *
 * @dot11ACKFailureCount: ACK failure counter
 * @dot11RTSFailureCount: RTS failure counter
 * @dot11FCSErrorCount: FCS error counter
 * @dot11RTSSuccessCount: RTS success counter
 */
struct ieee80211_low_level_stats {
	unsigned int dot11ACKFailureCount;
	unsigned int dot11RTSFailureCount;
	unsigned int dot11FCSErrorCount;
	unsigned int dot11RTSSuccessCount;
};

/**
 * enum ieee80211_chanctx_change - change flag for channel context
 * @IEEE80211_CHANCTX_CHANGE_WIDTH: The channel width changed
 * @IEEE80211_CHANCTX_CHANGE_RX_CHAINS: The number of RX chains changed
 * @IEEE80211_CHANCTX_CHANGE_RADAR: radar detection flag changed
 * @IEEE80211_CHANCTX_CHANGE_CHANNEL: switched to another operating channel,
 *	this is used only with channel switching with CSA
 * @IEEE80211_CHANCTX_CHANGE_MIN_WIDTH: The min required channel width changed
 * @IEEE80211_CHANCTX_CHANGE_AP: The AP channel definition changed, so (wider
 *	bandwidth) OFDMA settings need to be changed
 * @IEEE80211_CHANCTX_CHANGE_PUNCTURING: The punctured channel(s) bitmap
 *	was changed.
*/
enum ieee80211_chanctx_change {
	/* each flag occupies its own bit, BIT(0) through BIT(6) */
	IEEE80211_CHANCTX_CHANGE_WIDTH		= BIT(0),
	IEEE80211_CHANCTX_CHANGE_RX_CHAINS	= BIT(1),
	IEEE80211_CHANCTX_CHANGE_RADAR		= BIT(2),
	IEEE80211_CHANCTX_CHANGE_CHANNEL	= BIT(3),
	IEEE80211_CHANCTX_CHANGE_MIN_WIDTH	= BIT(4),
	IEEE80211_CHANCTX_CHANGE_AP		= BIT(5),
	IEEE80211_CHANCTX_CHANGE_PUNCTURING	= BIT(6),
};

/**
 * struct ieee80211_chan_req - A channel "request"
 * @oper: channel definition to use for operation
 * @ap: the channel definition of the AP, if any
 *	(otherwise the chan member is %NULL)
 */
struct ieee80211_chan_req {
	struct cfg80211_chan_def oper;
	struct cfg80211_chan_def ap;
};

/**
 * struct ieee80211_chanctx_conf - channel context that vifs may be tuned to
 *
 * This is the driver-visible part. The ieee80211_chanctx
 * that contains it is visible in mac80211 only.
 *
 * @def: the channel definition
 * @min_def: the minimum channel definition currently required.
 * @ap: the channel definition the AP actually is operating as,
 *	for use with (wider bandwidth) OFDMA
 * @rx_chains_static: The number of RX chains that must always be
 *	active on the channel to receive MIMO transmissions
 * @rx_chains_dynamic: The number of RX chains that must be enabled
 *	after RTS/CTS handshake to receive SMPS MIMO transmissions;
 *	this will always be >= @rx_chains_static.
 * @radar_enabled: whether radar detection is enabled on this channel.
 * @drv_priv: data area for driver use, will always be aligned to
 *	sizeof(void *), size is determined in hw information.
 */
struct ieee80211_chanctx_conf {
	struct cfg80211_chan_def def;
	struct cfg80211_chan_def min_def;
	struct cfg80211_chan_def ap;

	u8 rx_chains_static, rx_chains_dynamic;

	bool radar_enabled;

	/* flexible array member: must remain the last member of the struct */
	u8 drv_priv[] __aligned(sizeof(void *));
};

/**
 * enum ieee80211_chanctx_switch_mode - channel context switch mode
 * @CHANCTX_SWMODE_REASSIGN_VIF: Both old and new contexts already
 *	exist (and will continue to exist), but the virtual interface
 *	needs to be switched from one to the other.
* @CHANCTX_SWMODE_SWAP_CONTEXTS: The old context exists but will stop
 *	to exist with this call, the new context doesn't exist but
 *	will be active after this call, the virtual interface switches
 *	from the old to the new (note that the driver may of course
 *	implement this as an on-the-fly chandef switch of the existing
 *	hardware context, but the mac80211 pointer for the old context
 *	will cease to exist and only the new one will later be used
 *	for changes/removal.)
 */
enum ieee80211_chanctx_switch_mode {
	CHANCTX_SWMODE_REASSIGN_VIF,
	CHANCTX_SWMODE_SWAP_CONTEXTS,
};

/**
 * struct ieee80211_vif_chanctx_switch - vif chanctx switch information
 *
 * This structure is used to pass information about a vif that
 * needs to switch from one chanctx to another.  The
 * &ieee80211_chanctx_switch_mode defines how the switch should be
 * done.
 *
 * @vif: the vif that should be switched from old_ctx to new_ctx
 * @link_conf: the link conf that's switching
 * @old_ctx: the old context to which the vif was assigned
 * @new_ctx: the new context to which the vif must be assigned
 */
struct ieee80211_vif_chanctx_switch {
	struct ieee80211_vif *vif;
	struct ieee80211_bss_conf *link_conf;
	struct ieee80211_chanctx_conf *old_ctx;
	struct ieee80211_chanctx_conf *new_ctx;
};

/**
 * enum ieee80211_bss_change - BSS change notification flags
 *
 * These flags are used with the bss_info_changed(), link_info_changed()
 * and vif_cfg_changed() callbacks to indicate which parameter(s) changed.
 *
 * @BSS_CHANGED_ASSOC: association status changed (associated/disassociated),
 *	also implies a change in the AID.
* @BSS_CHANGED_ERP_CTS_PROT: CTS protection changed * @BSS_CHANGED_ERP_PREAMBLE: preamble changed * @BSS_CHANGED_ERP_SLOT: slot timing changed * @BSS_CHANGED_HT: 802.11n parameters changed * @BSS_CHANGED_BASIC_RATES: Basic rateset changed * @BSS_CHANGED_BEACON_INT: Beacon interval changed * @BSS_CHANGED_BSSID: BSSID changed, for whatever * reason (IBSS and managed mode) * @BSS_CHANGED_BEACON: Beacon data changed, retrieve * new beacon (beaconing modes) * @BSS_CHANGED_BEACON_ENABLED: Beaconing should be * enabled/disabled (beaconing modes) * @BSS_CHANGED_CQM: Connection quality monitor config changed * @BSS_CHANGED_IBSS: IBSS join status changed * @BSS_CHANGED_ARP_FILTER: Hardware ARP filter address list or state changed. * @BSS_CHANGED_QOS: QoS for this association was enabled/disabled. Note * that it is only ever disabled for station mode. * @BSS_CHANGED_IDLE: Idle changed for this BSS/interface. * @BSS_CHANGED_SSID: SSID changed for this BSS (AP and IBSS mode) * @BSS_CHANGED_AP_PROBE_RESP: Probe Response changed for this BSS (AP mode) * @BSS_CHANGED_PS: PS changed for this BSS (STA mode) * @BSS_CHANGED_TXPOWER: TX power setting changed for this interface * @BSS_CHANGED_P2P_PS: P2P powersave settings (CTWindow, opportunistic PS) * changed * @BSS_CHANGED_BEACON_INFO: Data from the AP's beacon became available: * currently dtim_period only is under consideration. * @BSS_CHANGED_BANDWIDTH: The bandwidth used by this interface changed, * note that this is only called when it changes after the channel * context had been assigned. * @BSS_CHANGED_OCB: OCB join status changed * @BSS_CHANGED_MU_GROUPS: VHT MU-MIMO group id or user position changed * @BSS_CHANGED_KEEP_ALIVE: keep alive options (idle period or protected * keep alive) changed. * @BSS_CHANGED_MCAST_RATE: Multicast Rate setting changed for this interface * @BSS_CHANGED_FTM_RESPONDER: fine timing measurement request responder * functionality changed for this BSS (AP mode). 
* @BSS_CHANGED_TWT: TWT status changed * @BSS_CHANGED_HE_OBSS_PD: OBSS Packet Detection status changed. * @BSS_CHANGED_HE_BSS_COLOR: BSS Color has changed * @BSS_CHANGED_FILS_DISCOVERY: FILS discovery status changed. * @BSS_CHANGED_UNSOL_BCAST_PROBE_RESP: Unsolicited broadcast probe response * status changed. * @BSS_CHANGED_MLD_VALID_LINKS: MLD valid links status changed. * @BSS_CHANGED_MLD_TTLM: TID to link mapping was changed */ enum ieee80211_bss_change { BSS_CHANGED_ASSOC = 1<<0, BSS_CHANGED_ERP_CTS_PROT = 1<<1, BSS_CHANGED_ERP_PREAMBLE = 1<<2, BSS_CHANGED_ERP_SLOT = 1<<3, BSS_CHANGED_HT = 1<<4, BSS_CHANGED_BASIC_RATES = 1<<5, BSS_CHANGED_BEACON_INT = 1<<6, BSS_CHANGED_BSSID = 1<<7, BSS_CHANGED_BEACON = 1<<8, BSS_CHANGED_BEACON_ENABLED = 1<<9, BSS_CHANGED_CQM = 1<<10, BSS_CHANGED_IBSS = 1<<11, BSS_CHANGED_ARP_FILTER = 1<<12, BSS_CHANGED_QOS = 1<<13, BSS_CHANGED_IDLE = 1<<14, BSS_CHANGED_SSID = 1<<15, BSS_CHANGED_AP_PROBE_RESP = 1<<16, BSS_CHANGED_PS = 1<<17, BSS_CHANGED_TXPOWER = 1<<18, BSS_CHANGED_P2P_PS = 1<<19, BSS_CHANGED_BEACON_INFO = 1<<20, BSS_CHANGED_BANDWIDTH = 1<<21, BSS_CHANGED_OCB = 1<<22, BSS_CHANGED_MU_GROUPS = 1<<23, BSS_CHANGED_KEEP_ALIVE = 1<<24, BSS_CHANGED_MCAST_RATE = 1<<25, BSS_CHANGED_FTM_RESPONDER = 1<<26, BSS_CHANGED_TWT = 1<<27, BSS_CHANGED_HE_OBSS_PD = 1<<28, BSS_CHANGED_HE_BSS_COLOR = 1<<29, BSS_CHANGED_FILS_DISCOVERY = 1<<30, BSS_CHANGED_UNSOL_BCAST_PROBE_RESP = 1<<31, BSS_CHANGED_MLD_VALID_LINKS = BIT_ULL(33), BSS_CHANGED_MLD_TTLM = BIT_ULL(34), /* when adding here, make sure to change ieee80211_reconfig */ }; /* * The maximum number of IPv4 addresses listed for ARP filtering. If the number * of addresses for an interface increase beyond this value, hardware ARP * filtering will be disabled. */ #define IEEE80211_BSS_ARP_ADDR_LIST_LEN 4 /** * enum ieee80211_event_type - event to be notified to the low level driver * @RSSI_EVENT: AP's rssi crossed the a threshold set by the driver. 
* @MLME_EVENT: event related to MLME
 * @BAR_RX_EVENT: a BAR was received
 * @BA_FRAME_TIMEOUT: Frames were released from the reordering buffer because
 *	they timed out. This won't be called for each frame released, but only
 *	once each time the timeout triggers.
 */
enum ieee80211_event_type {
	RSSI_EVENT,
	MLME_EVENT,
	BAR_RX_EVENT,
	BA_FRAME_TIMEOUT,
};

/*
 * NOTE(review): the HIGH/LOW descriptions below read as swapped relative to
 * the constant names (HIGH is described as "went below", LOW as "went
 * above").  Confirm against the code that raises %RSSI_EVENT before relying
 * on either the names or the text.
 */
/**
 * enum ieee80211_rssi_event_data - relevant when event type is %RSSI_EVENT
 * @RSSI_EVENT_HIGH: AP's rssi went below the threshold set by the driver.
 * @RSSI_EVENT_LOW: AP's rssi went above the threshold set by the driver.
 */
enum ieee80211_rssi_event_data {
	RSSI_EVENT_HIGH,
	RSSI_EVENT_LOW,
};

/**
 * struct ieee80211_rssi_event - data attached to an %RSSI_EVENT
 * @data: See &enum ieee80211_rssi_event_data
 */
struct ieee80211_rssi_event {
	enum ieee80211_rssi_event_data data;
};

/**
 * enum ieee80211_mlme_event_data - relevant when event type is %MLME_EVENT
 * @AUTH_EVENT: the MLME operation is authentication
 * @ASSOC_EVENT: the MLME operation is association
 * @DEAUTH_RX_EVENT: deauth received.
 * @DEAUTH_TX_EVENT: deauth sent.
 */
enum ieee80211_mlme_event_data {
	AUTH_EVENT,
	ASSOC_EVENT,
	DEAUTH_RX_EVENT,
	DEAUTH_TX_EVENT,
};

/**
 * enum ieee80211_mlme_event_status - relevant when event type is %MLME_EVENT
 * @MLME_SUCCESS: the MLME operation completed successfully.
 * @MLME_DENIED: the MLME operation was denied by the peer.
 * @MLME_TIMEOUT: the MLME operation timed out.
*/
enum ieee80211_mlme_event_status {
	MLME_SUCCESS,
	MLME_DENIED,
	MLME_TIMEOUT,
};

/**
 * struct ieee80211_mlme_event - data attached to an %MLME_EVENT
 * @data: See &enum ieee80211_mlme_event_data
 * @status: See &enum ieee80211_mlme_event_status
 * @reason: the reason code if applicable
 */
struct ieee80211_mlme_event {
	enum ieee80211_mlme_event_data data;
	enum ieee80211_mlme_event_status status;
	u16 reason;
};

/**
 * struct ieee80211_ba_event - data attached for BlockAck related events
 * @sta: pointer to the &ieee80211_sta to which this event relates
 * @tid: the tid
 * @ssn: the starting sequence number (for %BAR_RX_EVENT)
 */
struct ieee80211_ba_event {
	struct ieee80211_sta *sta;
	u16 tid;
	u16 ssn;
};

/**
 * struct ieee80211_event - event to be sent to the driver
 * @type: The event itself. See &enum ieee80211_event_type.
 * @u.rssi: relevant if &type is %RSSI_EVENT
 * @u.mlme: relevant if &type is %MLME_EVENT
 * @u.ba: relevant if &type is %BAR_RX_EVENT or %BA_FRAME_TIMEOUT
 * @u: union holding the fields above
 */
struct ieee80211_event {
	enum ieee80211_event_type type;
	/* @type selects which member of the union is meaningful */
	union {
		struct ieee80211_rssi_event rssi;
		struct ieee80211_mlme_event mlme;
		struct ieee80211_ba_event ba;
	} u;
};

/**
 * struct ieee80211_mu_group_data - STA's VHT MU-MIMO group data
 *
 * This structure describes the group id data of VHT MU-MIMO
 *
 * @membership: 64 bits array - a bit is set if station is member of the group
 * @position: 2 bits per group id indicating the position in the group
 */
struct ieee80211_mu_group_data {
	u8 membership[WLAN_MEMBERSHIP_LEN];
	u8 position[WLAN_USER_POSITION_LEN];
};

/**
 * struct ieee80211_ftm_responder_params - FTM responder parameters
 *
 * @lci: LCI subelement content
 * @civicloc: CIVIC location subelement content
 * @lci_len: LCI data length
 * @civicloc_len: Civic data length
 */
struct ieee80211_ftm_responder_params {
	const u8 *lci;
	const u8 *civicloc;
	size_t lci_len;
	size_t civicloc_len;
};

/**
 * struct ieee80211_fils_discovery - FILS discovery parameters from
 * IEEE Std 802.11ai-2016, Annex C.3 MIB detail.
 *
 * @min_interval: Minimum packet interval in TUs (0 - 10000)
 * @max_interval: Maximum packet interval in TUs (0 - 10000)
 */
struct ieee80211_fils_discovery {
	u32 min_interval;
	u32 max_interval;
};

/**
 * struct ieee80211_bss_conf - holds the BSS's changing parameters
 *
 * This structure keeps information about a BSS (and an association
 * to that BSS) that can change during the lifetime of the BSS.
 *
 * @vif: reference to owning VIF
 * @bss: the cfg80211 bss descriptor. Valid only for a station, and only
 *	when associated. Note: This contains information which is not
 *	necessarily authenticated. For example, information coming from probe
 *	responses.
 * @addr: (link) address used locally
 * @link_id: link ID, or 0 for non-MLO
 * @htc_trig_based_pkt_ext: default PE in 4us units, if BSS supports HE
 * @uora_exists: is the UORA element advertised by AP
 * @uora_ocw_range: UORA element's OCW Range field
 * @frame_time_rts_th: HE duration RTS threshold, in units of 32us
 * @he_support: does this BSS support HE
 * @twt_requester: does this BSS support TWT requester (relevant for managed
 *	mode only, set if the AP advertises TWT responder role)
 * @twt_responder: does this BSS support TWT requester (relevant for managed
 *	mode only, set if the AP advertises TWT responder role)
 * @twt_protected: does this BSS support protected TWT frames
 * @twt_broadcast: does this BSS support broadcast TWT
 * @use_cts_prot: use CTS protection
 * @use_short_preamble: use 802.11b short preamble
 * @use_short_slot: use short slot time (only relevant for ERP)
 * @dtim_period: num of beacons before the next DTIM, for beaconing,
 *	valid in station mode only if after the driver was notified
 *	with the %BSS_CHANGED_BEACON_INFO flag, will be non-zero then.
 * @sync_tsf: last beacon's/probe response's TSF timestamp (could be old
 *	as it may have been received during scanning long ago).
If the * HW flag %IEEE80211_HW_TIMING_BEACON_ONLY is set, then this can * only come from a beacon, but might not become valid until after * association when a beacon is received (which is notified with the * %BSS_CHANGED_BEACON_INFO flag). See also sync_dtim_count important notice. * @sync_device_ts: the device timestamp corresponding to the sync_tsf, * the driver/device can use this to calculate synchronisation * (see @sync_tsf). See also sync_dtim_count important notice. * @sync_dtim_count: Only valid when %IEEE80211_HW_TIMING_BEACON_ONLY * is requested, see @sync_tsf/@sync_device_ts. * IMPORTANT: These three sync_* parameters would possibly be out of sync * by the time the driver will use them. The synchronized view is currently * guaranteed only in certain callbacks. * Note also that this is not used with MLD associations, mac80211 doesn't * know how to track beacons for all of the links for this. * @beacon_int: beacon interval * @assoc_capability: capabilities taken from assoc resp * @basic_rates: bitmap of basic rates, each bit stands for an * index into the rate table configured by the driver in * the current band. * @beacon_rate: associated AP's beacon TX rate * @mcast_rate: per-band multicast rate index + 1 (0: disabled) * @bssid: The BSSID for this BSS * @enable_beacon: whether beaconing should be enabled or not * @chanreq: Channel request for this BSS -- the hardware might be * configured a higher bandwidth than this BSS uses, for example. * @mu_group: VHT MU-MIMO group membership data * @ht_operation_mode: HT operation mode like in &struct ieee80211_ht_operation. * This field is only valid when the channel is a wide HT/VHT channel. * Note that with TDLS this can be the case (channel is HT, protection must * be used from this field) even when the BSS association isn't using HT. * @cqm_rssi_thold: Connection quality monitor RSSI threshold, a zero value * implies disabled. 
As with the cfg80211 callback, a change here should * cause an event to be sent indicating where the current value is in * relation to the newly configured threshold. * @cqm_rssi_low: Connection quality monitor RSSI lower threshold, a zero value * implies disabled. This is an alternative mechanism to the single * threshold event and can't be enabled simultaneously with it. * @cqm_rssi_high: Connection quality monitor RSSI upper threshold. * @cqm_rssi_hyst: Connection quality monitor RSSI hysteresis * @qos: This is a QoS-enabled BSS. * @hidden_ssid: The SSID of the current vif is hidden. Only valid in AP-mode. * @txpower: TX power in dBm. INT_MIN means not configured. * @txpower_type: TX power adjustment used to control per packet Transmit * Power Control (TPC) in lower driver for the current vif. In particular * TPC is enabled if value passed in %txpower_type is * NL80211_TX_POWER_LIMITED (allow using less than specified from * userspace), whereas TPC is disabled if %txpower_type is set to * NL80211_TX_POWER_FIXED (use value configured from userspace) * @p2p_noa_attr: P2P NoA attribute for P2P powersave * @allow_p2p_go_ps: indication for AP or P2P GO interface, whether it's allowed * to use P2P PS mechanism or not. AP/P2P GO is not allowed to use P2P PS * if it has associated clients without P2P PS support. * @max_idle_period: the time period during which the station can refrain from * transmitting frames to its associated AP without being disassociated. * In units of 1000 TUs. Zero value indicates that the AP did not include * a (valid) BSS Max Idle Period Element. * @protected_keep_alive: if set, indicates that the station should send an RSN * protected frame to the AP to reset the idle timer at the AP for the * station. * @ftm_responder: whether to enable or disable fine timing measurement FTM * responder functionality. * @ftmr_params: configurable lci/civic parameter when enabling FTM responder. 
* @nontransmitted: this BSS is a nontransmitted BSS profile * @transmitter_bssid: the address of transmitter AP * @bssid_index: index inside the multiple BSSID set * @bssid_indicator: 2^bssid_indicator is the maximum number of APs in set * @ema_ap: AP supports enhancements of discovery and advertisement of * nontransmitted BSSIDs * @profile_periodicity: the least number of beacon frames need to be received * in order to discover all the nontransmitted BSSIDs in the set. * @he_oper: HE operation information of the BSS (AP/Mesh) or of the AP we are * connected to (STA) * @he_obss_pd: OBSS Packet Detection parameters. * @he_bss_color: BSS coloring settings, if BSS supports HE * @fils_discovery: FILS discovery configuration * @unsol_bcast_probe_resp_interval: Unsolicited broadcast probe response * interval. * @beacon_tx_rate: The configured beacon transmit rate that needs to be passed * to driver when rate control is offloaded to firmware. * @power_type: power type of BSS for 6 GHz * @tx_pwr_env: transmit power envelope array of BSS. * @tx_pwr_env_num: number of @tx_pwr_env. * @pwr_reduction: power constraint of BSS. * @eht_support: does this BSS support EHT * @csa_active: marks whether a channel switch is going on. * @mu_mimo_owner: indicates interface owns MU-MIMO capability * @chanctx_conf: The channel context this interface is assigned to, or %NULL * when it is not assigned. This pointer is RCU-protected due to the TX * path needing to access it; even though the netdev carrier will always * be off when it is %NULL there can still be races and packets could be * processed after it switches back to %NULL. * @color_change_active: marks whether a color change is ongoing. * @color_change_color: the bss color that will be used after the change. * @ht_ldpc: in AP mode, indicates interface has HT LDPC capability. * @vht_ldpc: in AP mode, indicates interface has VHT LDPC capability. * @he_ldpc: in AP mode, indicates interface has HE LDPC capability. 
* @vht_su_beamformer: in AP mode, does this BSS support operation as a VHT SU
 *	beamformer
 * @vht_su_beamformee: in AP mode, does this BSS support operation as a VHT SU
 *	beamformee
 * @vht_mu_beamformer: in AP mode, does this BSS support operation as a VHT MU
 *	beamformer
 * @vht_mu_beamformee: in AP mode, does this BSS support operation as a VHT MU
 *	beamformee
 * @he_su_beamformer: in AP-mode, does this BSS support operation as an HE SU
 *	beamformer
 * @he_su_beamformee: in AP-mode, does this BSS support operation as an HE SU
 *	beamformee
 * @he_mu_beamformer: in AP-mode, does this BSS support operation as an HE MU
 *	beamformer
 * @he_full_ul_mumimo: does this BSS support the reception (AP) or transmission
 *	(non-AP STA) of an HE TB PPDU on an RU that spans the entire PPDU
 *	bandwidth
 * @eht_su_beamformer: in AP-mode, does this BSS enable operation as an EHT SU
 *	beamformer
 * @eht_su_beamformee: in AP-mode, does this BSS enable operation as an EHT SU
 *	beamformee
 * @eht_mu_beamformer: in AP-mode, does this BSS enable operation as an EHT MU
 *	beamformer
 */
struct ieee80211_bss_conf {
	struct ieee80211_vif *vif;
	struct cfg80211_bss *bss;

	const u8 *bssid;
	unsigned int link_id;
	/* explicitly 2-byte aligned via __aligned(2) */
	u8 addr[ETH_ALEN] __aligned(2);

	u8 htc_trig_based_pkt_ext;
	bool uora_exists;
	u8 uora_ocw_range;
	u16 frame_time_rts_th;
	bool he_support;
	bool twt_requester;
	bool twt_responder;
	bool twt_protected;
	bool twt_broadcast;
	/* erp related data */
	bool use_cts_prot;
	bool use_short_preamble;
	bool use_short_slot;
	bool enable_beacon;
	u8 dtim_period;
	u16 beacon_int;
	u16 assoc_capability;
	/* the three sync_* members are documented together in the kernel-doc */
	u64 sync_tsf;
	u32 sync_device_ts;
	u8 sync_dtim_count;
	u32 basic_rates;
	struct ieee80211_rate *beacon_rate;
	/* one entry per band, sized by NUM_NL80211_BANDS */
	int mcast_rate[NUM_NL80211_BANDS];
	u16 ht_operation_mode;
	s32 cqm_rssi_thold;
	u32 cqm_rssi_hyst;
	s32 cqm_rssi_low;
	s32 cqm_rssi_high;
	struct ieee80211_chan_req chanreq;
	struct ieee80211_mu_group_data mu_group;
	bool qos;
	bool hidden_ssid;
	int txpower;
	enum nl80211_tx_power_setting txpower_type;
	struct ieee80211_p2p_noa_attr p2p_noa_attr;
	bool allow_p2p_go_ps;
	u16 max_idle_period;
	bool protected_keep_alive;
	bool ftm_responder;
	struct ieee80211_ftm_responder_params *ftmr_params;
	/* Multiple BSSID data */
	bool nontransmitted;
	u8 transmitter_bssid[ETH_ALEN];
	u8 bssid_index;
	u8 bssid_indicator;
	bool ema_ap;
	u8 profile_periodicity;
	/* anonymous struct type: the two members are only reachable via he_oper */
	struct {
		u32 params;
		u16 nss_set;
	} he_oper;
	struct ieee80211_he_obss_pd he_obss_pd;
	struct cfg80211_he_bss_color he_bss_color;
	struct ieee80211_fils_discovery fils_discovery;
	u32 unsol_bcast_probe_resp_interval;
	struct cfg80211_bitrate_mask beacon_tx_rate;
	enum ieee80211_ap_reg_power power_type;
	/* array capacity is IEEE80211_TPE_MAX_IE_COUNT; see @tx_pwr_env_num */
	struct ieee80211_tx_pwr_env tx_pwr_env[IEEE80211_TPE_MAX_IE_COUNT];
	u8 tx_pwr_env_num;
	u8 pwr_reduction;
	bool eht_support;

	bool csa_active;

	bool mu_mimo_owner;
	/* __rcu annotated pointer; see @chanctx_conf in the kernel-doc */
	struct ieee80211_chanctx_conf __rcu *chanctx_conf;

	bool color_change_active;
	u8 color_change_color;

	bool ht_ldpc;
	bool vht_ldpc;
	bool he_ldpc;
	bool vht_su_beamformer;
	bool vht_su_beamformee;
	bool vht_mu_beamformer;
	bool vht_mu_beamformee;
	bool he_su_beamformer;
	bool he_su_beamformee;
	bool he_mu_beamformer;
	bool he_full_ul_mumimo;
	bool eht_su_beamformer;
	bool eht_su_beamformee;
	bool eht_mu_beamformer;
};

/**
 * enum mac80211_tx_info_flags - flags to describe transmission information/status
 *
 * These flags are used with the @flags member of &ieee80211_tx_info.
 *
 * @IEEE80211_TX_CTL_REQ_TX_STATUS: require TX status callback for this frame.
 * @IEEE80211_TX_CTL_ASSIGN_SEQ: The driver has to assign a sequence
 *	number to this frame, taking care of not overwriting the fragment
 *	number and increasing the sequence number only when the
 *	IEEE80211_TX_CTL_FIRST_FRAGMENT flag is set. mac80211 will properly
 *	assign sequence numbers to QoS-data frames but cannot do so correctly
 *	for non-QoS-data and management frames because beacons need them from
 *	that counter as well and mac80211 cannot guarantee proper sequencing.
* If this flag is set, the driver should instruct the hardware to * assign a sequence number to the frame or assign one itself. Cf. IEEE * 802.11-2007 7.1.3.4.1 paragraph 3. This flag will always be set for * beacons and always be clear for frames without a sequence number field. * @IEEE80211_TX_CTL_NO_ACK: tell the low level not to wait for an ack * @IEEE80211_TX_CTL_CLEAR_PS_FILT: clear powersave filter for destination * station * @IEEE80211_TX_CTL_FIRST_FRAGMENT: this is a first fragment of the frame * @IEEE80211_TX_CTL_SEND_AFTER_DTIM: send this frame after DTIM beacon * @IEEE80211_TX_CTL_AMPDU: this frame should be sent as part of an A-MPDU * @IEEE80211_TX_CTL_INJECTED: Frame was injected, internal to mac80211. * @IEEE80211_TX_STAT_TX_FILTERED: The frame was not transmitted * because the destination STA was in powersave mode. Note that to * avoid race conditions, the filter must be set by the hardware or * firmware upon receiving a frame that indicates that the station * went to sleep (must be done on device to filter frames already on * the queue) and may only be unset after mac80211 gives the OK for * that by setting the IEEE80211_TX_CTL_CLEAR_PS_FILT (see above), * since only then is it guaranteed that no more frames are in the * hardware queue. * @IEEE80211_TX_STAT_ACK: Frame was acknowledged * @IEEE80211_TX_STAT_AMPDU: The frame was aggregated, so status * is for the whole aggregation. * @IEEE80211_TX_STAT_AMPDU_NO_BACK: no block ack was returned, * so consider using block ack request (BAR). * @IEEE80211_TX_CTL_RATE_CTRL_PROBE: internal to mac80211, can be * set by rate control algorithms to indicate probe rate, will * be cleared for fragmented frames (except on the last fragment) * @IEEE80211_TX_INTFL_OFFCHAN_TX_OK: Internal to mac80211. Used to indicate * that a frame can be transmitted while the queues are stopped for * off-channel operation. 
* @IEEE80211_TX_CTL_HW_80211_ENCAP: This frame uses hardware encapsulation * (header conversion) * @IEEE80211_TX_INTFL_RETRIED: completely internal to mac80211, * used to indicate that a frame was already retried due to PS * @IEEE80211_TX_INTFL_DONT_ENCRYPT: completely internal to mac80211, * used to indicate frame should not be encrypted * @IEEE80211_TX_CTL_NO_PS_BUFFER: This frame is a response to a poll * frame (PS-Poll or uAPSD) or a non-bufferable MMPDU and must * be sent although the station is in powersave mode. * @IEEE80211_TX_CTL_MORE_FRAMES: More frames will be passed to the * transmit function after the current frame, this can be used * by drivers to kick the DMA queue only if unset or when the * queue gets full. * @IEEE80211_TX_INTFL_RETRANSMISSION: This frame is being retransmitted * after TX status because the destination was asleep, it must not * be modified again (no seqno assignment, crypto, etc.) * @IEEE80211_TX_INTFL_MLME_CONN_TX: This frame was transmitted by the MLME * code for connection establishment, this indicates that its status * should kick the MLME state machine. * @IEEE80211_TX_INTFL_NL80211_FRAME_TX: Frame was requested through nl80211 * MLME command (internal to mac80211 to figure out whether to send TX * status to user space) * @IEEE80211_TX_CTL_LDPC: tells the driver to use LDPC for this frame * @IEEE80211_TX_CTL_STBC: Enables Space-Time Block Coding (STBC) for this * frame and selects the maximum number of streams that it can use. * @IEEE80211_TX_CTL_TX_OFFCHAN: Marks this packet to be transmitted on * the off-channel channel when a remain-on-channel offload is done * in hardware -- normal packets still flow and are expected to be * handled properly by the device. * @IEEE80211_TX_INTFL_TKIP_MIC_FAILURE: Marks this packet to be used for TKIP * testing. It will be sent out with incorrect Michael MIC key to allow * TKIP countermeasures to be tested. * @IEEE80211_TX_CTL_NO_CCK_RATE: This frame will be sent at non CCK rate. 
* This flag is actually used for management frame especially for P2P * frames not being sent at CCK rate in 2GHz band. * @IEEE80211_TX_STATUS_EOSP: This packet marks the end of service period, * when its status is reported the service period ends. For frames in * an SP that mac80211 transmits, it is already set; for driver frames * the driver may set this flag. It is also used to do the same for * PS-Poll responses. * @IEEE80211_TX_CTL_USE_MINRATE: This frame will be sent at lowest rate. * This flag is used to send nullfunc frame at minimum rate when * the nullfunc is used for connection monitoring purpose. * @IEEE80211_TX_CTL_DONTFRAG: Don't fragment this packet even if it * would be fragmented by size (this is optional, only used for * monitor injection). * @IEEE80211_TX_STAT_NOACK_TRANSMITTED: A frame that was marked with * IEEE80211_TX_CTL_NO_ACK has been successfully transmitted without * any errors (like issues specific to the driver/HW). * This flag must not be set for frames that don't request no-ack * behaviour with IEEE80211_TX_CTL_NO_ACK. * * Note: If you have to add new flags to the enumeration, then don't * forget to update %IEEE80211_TX_TEMPORARY_FLAGS when necessary. 
*/
enum mac80211_tx_info_flags {
	IEEE80211_TX_CTL_REQ_TX_STATUS = BIT(0),
	IEEE80211_TX_CTL_ASSIGN_SEQ = BIT(1),
	IEEE80211_TX_CTL_NO_ACK = BIT(2),
	IEEE80211_TX_CTL_CLEAR_PS_FILT = BIT(3),
	IEEE80211_TX_CTL_FIRST_FRAGMENT = BIT(4),
	IEEE80211_TX_CTL_SEND_AFTER_DTIM = BIT(5),
	IEEE80211_TX_CTL_AMPDU = BIT(6),
	IEEE80211_TX_CTL_INJECTED = BIT(7),
	IEEE80211_TX_STAT_TX_FILTERED = BIT(8),
	IEEE80211_TX_STAT_ACK = BIT(9),
	IEEE80211_TX_STAT_AMPDU = BIT(10),
	IEEE80211_TX_STAT_AMPDU_NO_BACK = BIT(11),
	IEEE80211_TX_CTL_RATE_CTRL_PROBE = BIT(12),
	IEEE80211_TX_INTFL_OFFCHAN_TX_OK = BIT(13),
	IEEE80211_TX_CTL_HW_80211_ENCAP = BIT(14),
	IEEE80211_TX_INTFL_RETRIED = BIT(15),
	IEEE80211_TX_INTFL_DONT_ENCRYPT = BIT(16),
	IEEE80211_TX_CTL_NO_PS_BUFFER = BIT(17),
	IEEE80211_TX_CTL_MORE_FRAMES = BIT(18),
	IEEE80211_TX_INTFL_RETRANSMISSION = BIT(19),
	IEEE80211_TX_INTFL_MLME_CONN_TX = BIT(20),
	IEEE80211_TX_INTFL_NL80211_FRAME_TX = BIT(21),
	IEEE80211_TX_CTL_LDPC = BIT(22),
	/* two-bit field; its lowest bit is IEEE80211_TX_CTL_STBC_SHIFT */
	IEEE80211_TX_CTL_STBC = BIT(23) | BIT(24),
	IEEE80211_TX_CTL_TX_OFFCHAN = BIT(25),
	IEEE80211_TX_INTFL_TKIP_MIC_FAILURE = BIT(26),
	IEEE80211_TX_CTL_NO_CCK_RATE = BIT(27),
	IEEE80211_TX_STATUS_EOSP = BIT(28),
	IEEE80211_TX_CTL_USE_MINRATE = BIT(29),
	IEEE80211_TX_CTL_DONTFRAG = BIT(30),
	/* bits 0-31 are all assigned; there is no free bit left here */
	IEEE80211_TX_STAT_NOACK_TRANSMITTED = BIT(31),
};

/* Position of the lowest bit of the IEEE80211_TX_CTL_STBC field. */
#define IEEE80211_TX_CTL_STBC_SHIFT 23

/* S1G MCS rates are flagged with the same bit as VHT MCS rates. */
#define IEEE80211_TX_RC_S1G_MCS IEEE80211_TX_RC_VHT_MCS

/**
 * enum mac80211_tx_control_flags - flags to describe transmit control
 *
 * @IEEE80211_TX_CTRL_PORT_CTRL_PROTO: this frame is a port control
 *	protocol frame (e.g. EAP)
 * @IEEE80211_TX_CTRL_PS_RESPONSE: This frame is a response to a poll
 *	frame (PS-Poll or uAPSD).
* @IEEE80211_TX_CTRL_RATE_INJECT: This frame is injected with rate information * @IEEE80211_TX_CTRL_AMSDU: This frame is an A-MSDU frame * @IEEE80211_TX_CTRL_FAST_XMIT: This frame is going through the fast_xmit path * @IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP: This frame skips mesh path lookup * @IEEE80211_TX_INTCFL_NEED_TXPROCESSING: completely internal to mac80211, * used to indicate that a pending frame requires TX processing before * it can be sent out. * @IEEE80211_TX_CTRL_NO_SEQNO: Do not overwrite the sequence number that * has already been assigned to this frame. * @IEEE80211_TX_CTRL_DONT_REORDER: This frame should not be reordered * relative to other frames that have this flag set, independent * of their QoS TID or other priority field values. * @IEEE80211_TX_CTRL_MCAST_MLO_FIRST_TX: first MLO TX, used mostly internally * for sequence number assignment * @IEEE80211_TX_CTRL_MLO_LINK: If not @IEEE80211_LINK_UNSPECIFIED, this * frame should be transmitted on the specific link. This really is * only relevant for frames that do not have data present, and is * also not used for 802.3 format frames. Note that even if the frame * is on a specific link, address translation might still apply if * it's intended for an MLD. * * These flags are used in tx_info->control.flags. 
*/
enum mac80211_tx_control_flags {
	IEEE80211_TX_CTRL_PORT_CTRL_PROTO = BIT(0),
	IEEE80211_TX_CTRL_PS_RESPONSE = BIT(1),
	IEEE80211_TX_CTRL_RATE_INJECT = BIT(2),
	IEEE80211_TX_CTRL_AMSDU = BIT(3),
	IEEE80211_TX_CTRL_FAST_XMIT = BIT(4),
	IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP = BIT(5),
	IEEE80211_TX_INTCFL_NEED_TXPROCESSING = BIT(6),
	IEEE80211_TX_CTRL_NO_SEQNO = BIT(7),
	IEEE80211_TX_CTRL_DONT_REORDER = BIT(8),
	IEEE80211_TX_CTRL_MCAST_MLO_FIRST_TX = BIT(9),
	/* not a single flag: mask of the 4-bit link field in bits 28-31 */
	IEEE80211_TX_CTRL_MLO_LINK = 0xf0000000,
};

/* All-ones value of the 4-bit link field: no specific link requested. */
#define IEEE80211_LINK_UNSPECIFIED 0xf

/* IEEE80211_LINK_UNSPECIFIED placed into the IEEE80211_TX_CTRL_MLO_LINK field. */
#define IEEE80211_TX_CTRL_MLO_LINK_UNSPEC \
	u32_encode_bits(IEEE80211_LINK_UNSPECIFIED, \
			IEEE80211_TX_CTRL_MLO_LINK)

/**
 * enum mac80211_tx_status_flags - flags to describe transmit status
 *
 * @IEEE80211_TX_STATUS_ACK_SIGNAL_VALID: ACK signal is valid
 *
 * These flags are used in tx_info->status.flags.
 */
enum mac80211_tx_status_flags {
	IEEE80211_TX_STATUS_ACK_SIGNAL_VALID = BIT(0),
};

/*
 * This definition is used as a mask to clear all temporary flags, which are
 * set by the tx handlers for each transmission attempt by the mac80211 stack.
 */
#define IEEE80211_TX_TEMPORARY_FLAGS (IEEE80211_TX_CTL_NO_ACK | \
	IEEE80211_TX_CTL_CLEAR_PS_FILT | IEEE80211_TX_CTL_FIRST_FRAGMENT | \
	IEEE80211_TX_CTL_SEND_AFTER_DTIM | IEEE80211_TX_CTL_AMPDU | \
	IEEE80211_TX_STAT_TX_FILTERED | IEEE80211_TX_STAT_ACK | \
	IEEE80211_TX_STAT_AMPDU | IEEE80211_TX_STAT_AMPDU_NO_BACK | \
	IEEE80211_TX_CTL_RATE_CTRL_PROBE | IEEE80211_TX_CTL_NO_PS_BUFFER | \
	IEEE80211_TX_CTL_MORE_FRAMES | IEEE80211_TX_CTL_LDPC | \
	IEEE80211_TX_CTL_STBC | IEEE80211_TX_STATUS_EOSP)

/**
 * enum mac80211_rate_control_flags - per-rate flags set by the
 *	Rate Control algorithm.
 *
 * These flags are set by the Rate control algorithm for each rate during tx,
 * in the @flags member of struct ieee80211_tx_rate.
 *
 * @IEEE80211_TX_RC_USE_RTS_CTS: Use RTS/CTS exchange for this rate.
 * @IEEE80211_TX_RC_USE_CTS_PROTECT: CTS-to-self protection is required.
 *	This is set if the current BSS requires ERP protection.
* @IEEE80211_TX_RC_USE_SHORT_PREAMBLE: Use short preamble.
 * @IEEE80211_TX_RC_MCS: HT rate.
 * @IEEE80211_TX_RC_VHT_MCS: VHT MCS rate, in this case the idx field is split
 *	into a higher 4 bits (Nss) and lower 4 bits (MCS number)
 * @IEEE80211_TX_RC_GREEN_FIELD: Indicates whether this rate should be used in
 *	Greenfield mode.
 * @IEEE80211_TX_RC_40_MHZ_WIDTH: Indicates if the Channel Width should be 40 MHz.
 * @IEEE80211_TX_RC_80_MHZ_WIDTH: Indicates 80 MHz transmission
 * @IEEE80211_TX_RC_160_MHZ_WIDTH: Indicates 160 MHz transmission
 *	(80+80 isn't supported yet)
 * @IEEE80211_TX_RC_DUP_DATA: The frame should be transmitted on both of the
 *	adjacent 20 MHz channels, if the current channel type is
 *	NL80211_CHAN_HT40MINUS or NL80211_CHAN_HT40PLUS.
 * @IEEE80211_TX_RC_SHORT_GI: Short Guard interval should be used for this rate.
 */
enum mac80211_rate_control_flags {
	IEEE80211_TX_RC_USE_RTS_CTS = BIT(0),
	IEEE80211_TX_RC_USE_CTS_PROTECT = BIT(1),
	IEEE80211_TX_RC_USE_SHORT_PREAMBLE = BIT(2),

	/* rate index is an HT/VHT MCS instead of an index */
	IEEE80211_TX_RC_MCS = BIT(3),
	IEEE80211_TX_RC_GREEN_FIELD = BIT(4),
	IEEE80211_TX_RC_40_MHZ_WIDTH = BIT(5),
	IEEE80211_TX_RC_DUP_DATA = BIT(6),
	IEEE80211_TX_RC_SHORT_GI = BIT(7),
	IEEE80211_TX_RC_VHT_MCS = BIT(8),
	IEEE80211_TX_RC_80_MHZ_WIDTH = BIT(9),
	/* highest flag in use is bit 10 */
	IEEE80211_TX_RC_160_MHZ_WIDTH = BIT(10),
};


/* there are 40 bytes if you don't need the rateset to be kept */
#define IEEE80211_TX_INFO_DRIVER_DATA_SIZE 40

/* if you do need the rateset, then you have less space */
#define IEEE80211_TX_INFO_RATE_DRIVER_DATA_SIZE 24

/* maximum number of rate stages */
#define IEEE80211_TX_MAX_RATES 4

/* maximum number of rate table entries */
#define IEEE80211_TX_RATE_TABLE_SIZE 4

/**
 * struct ieee80211_tx_rate - rate selection/status
 *
 * @idx: rate index to attempt to send with
 * @flags: rate control flags (&enum mac80211_rate_control_flags)
 * @count: number of tries in this rate before going to the next rate
 *
 * A value of -1 for @idx indicates an invalid
rate and, if used * in an array of retry rates, that no more rates should be tried. * * When used for transmit status reporting, the driver should * always report the rate along with the flags it used. * * &struct ieee80211_tx_info contains an array of these structs * in the control information, and it will be filled by the rate * control algorithm according to what should be sent. For example, * if this array contains, in the format { <idx>, <count> } the * information:: * * { 3, 2 }, { 2, 2 }, { 1, 4 }, { -1, 0 }, { -1, 0 } * * then this means that the frame should be transmitted * up to twice at rate 3, up to twice at rate 2, and up to four * times at rate 1 if it doesn't get acknowledged. Say it gets * acknowledged by the peer after the fifth attempt, the status * information should then contain:: * * { 3, 2 }, { 2, 2 }, { 1, 1 }, { -1, 0 } ... * * since it was transmitted twice at rate 3, twice at rate 2 * and once at rate 1 after which we received an acknowledgement. */ struct ieee80211_tx_rate { s8 idx; u16 count:5, flags:11; } __packed; #define IEEE80211_MAX_TX_RETRY 31 static inline bool ieee80211_rate_valid(struct ieee80211_tx_rate *rate) { return rate->idx >= 0 && rate->count > 0; } static inline void ieee80211_rate_set_vht(struct ieee80211_tx_rate *rate, u8 mcs, u8 nss) { WARN_ON(mcs & ~0xF); WARN_ON((nss - 1) & ~0x7); rate->idx = ((nss - 1) << 4) | mcs; } static inline u8 ieee80211_rate_get_vht_mcs(const struct ieee80211_tx_rate *rate) { return rate->idx & 0xF; } static inline u8 ieee80211_rate_get_vht_nss(const struct ieee80211_tx_rate *rate) { return (rate->idx >> 4) + 1; } /** * struct ieee80211_tx_info - skb transmit information * * This structure is placed in skb->cb for three uses: * (1) mac80211 TX control - mac80211 tells the driver what to do * (2) driver internal use (if applicable) * (3) TX status information - driver tells mac80211 what happened * * @flags: transmit info flags, defined above * @band: the band to transmit on (use e.g. 
for checking for races),
 *	not valid if the interface is an MLD since we won't know which
 *	link the frame will be transmitted on
 * @hw_queue: HW queue to put the frame on, skb_get_queue_mapping() gives the AC
 * @status_data: internal data for TX status handling, assigned privately,
 *	see also &enum ieee80211_status_data for the internal documentation
 * @status_data_idr: indicates status data is IDR allocated ID for ack frame
 * @tx_time_est: TX time estimate in units of 4us, used internally
 * @control: union part for control data
 * @control.rates: TX rates array to try
 * @control.rts_cts_rate_idx: rate for RTS or CTS
 * @control.use_rts: use RTS
 * @control.use_cts_prot: use RTS/CTS
 * @control.short_preamble: use short preamble (CCK only)
 * @control.skip_table: skip externally configured rate table
 * @control.antennas: antenna bitmap, for injection only
 * @control.jiffies: timestamp for expiry on powersave clients
 * @control.vif: virtual interface (may be NULL)
 * @control.hw_key: key to encrypt with (may be NULL)
 * @control.flags: control flags, see &enum mac80211_tx_control_flags
 * @control.enqueue_time: enqueue time (for iTXQs)
 * @driver_rates: alias to @control.rates to reserve space
 * @pad: padding
 * @rate_driver_data: driver use area if driver needs @control.rates
 * @status: union part for status data
 * @status.rates: attempted rates
 * @status.ack_signal: ACK signal
 * @status.ampdu_ack_len: AMPDU ack length
 * @status.ampdu_len: AMPDU length
 * @status.antenna: (legacy, kept only for iwlegacy)
 * @status.tx_time: airtime consumed for transmission; note this is only
 *	used for WMM AC, not for airtime fairness
 * @status.flags: status flags, see &enum mac80211_tx_status_flags
 * @status.status_driver_data: driver use area
 * @ack: union part for pure ACK data
 * @ack.cookie: cookie for the ACK
 * @driver_data: array of driver_data pointers
 */
struct ieee80211_tx_info {
	/* common information */
	u32 flags;
	u32 band:3,
	    status_data_idr:1,
	    status_data:13,
	    hw_queue:4,
	    tx_time_est:10;
	/* 1 free bit */

	/* the views below overlay each other; only one is meaningful at a time */
	union {
		struct {
			union {
				/* rate control */
				struct {
					struct ieee80211_tx_rate rates[
						IEEE80211_TX_MAX_RATES];
					s8 rts_cts_rate_idx;
					u8 use_rts:1;
					u8 use_cts_prot:1;
					u8 short_preamble:1;
					u8 skip_table:1;

					/* for injection only (bitmap) */
					u8 antennas:2;

					/* 14 bits free */
				};
				/* only needed before rate control */
				unsigned long jiffies;
			};
			/* NB: vif can be NULL for injected frames */
			struct ieee80211_vif *vif;
			struct ieee80211_key_conf *hw_key;
			u32 flags;
			codel_time_t enqueue_time;
		} control;
		struct {
			u64 cookie;
		} ack;
		struct {
			struct ieee80211_tx_rate rates[IEEE80211_TX_MAX_RATES];
			s32 ack_signal;
			u8 ampdu_ack_len;
			u8 ampdu_len;
			u8 antenna;
			u8 pad;
			u16 tx_time;
			u8 flags;
			u8 pad2;
			void *status_driver_data[16 / sizeof(void *)];
		} status;
		struct {
			struct ieee80211_tx_rate driver_rates[
				IEEE80211_TX_MAX_RATES];
			u8 pad[4];

			void *rate_driver_data[
				IEEE80211_TX_INFO_RATE_DRIVER_DATA_SIZE / sizeof(void *)];
		};
		void *driver_data[
			IEEE80211_TX_INFO_DRIVER_DATA_SIZE / sizeof(void *)];
	};
};

/*
 * Store a TX time estimate in @info and return the value that was actually
 * recorded: the input clamped to 4095 and rounded down to a multiple of 4.
 */
static inline u16
ieee80211_info_set_tx_time_est(struct ieee80211_tx_info *info, u16 tx_time_est)
{
	/* We only have 10 bits in tx_time_est, so store airtime
	 * in increments of 4us and clamp the maximum to 2**12-1
	 */
	info->tx_time_est = min_t(u16, tx_time_est, 4095) >> 2;
	return info->tx_time_est << 2;
}

/* Read back the stored TX time estimate, scaled from 4us units by << 2. */
static inline u16
ieee80211_info_get_tx_time_est(struct ieee80211_tx_info *info)
{
	return info->tx_time_est << 2;
}

/**
 * struct ieee80211_rate_status - mrr stage for status path
 *
 * This struct is used in struct ieee80211_tx_status to provide drivers a
 * dynamic way to report about used rates and power levels per packet.
 *
 * @rate_idx: The actual used rate.
 * @try_count: How often the rate was tried.
 * @tx_power_idx: An idx into the ieee80211_hw->tx_power_levels list of the
 *	corresponding wifi hardware. The idx shall point to the power level
 *	that was used when sending the packet.
*/
struct ieee80211_rate_status {
	struct rate_info rate_idx;
	u8 try_count;
	u8 tx_power_idx;
};

/**
 * struct ieee80211_tx_status - extended tx status info for rate control
 *
 * @sta: Station that the packet was transmitted for
 * @info: Basic tx status information
 * @skb: Packet skb (can be NULL if not provided by the driver)
 * @rates: Mrr stages that were used when sending the packet
 * @n_rates: Number of mrr stages (count of instances for @rates)
 * @free_list: list where processed skbs are stored to be free'd by the driver
 * @ack_hwtstamp: Hardware timestamp of the received ack in nanoseconds
 *	Only needed for Timing measurement and Fine timing measurement action
 *	frames. Only reported by devices that have timestamping enabled.
 */
struct ieee80211_tx_status {
	struct ieee80211_sta *sta;
	struct ieee80211_tx_info *info;
	struct sk_buff *skb;
	struct ieee80211_rate_status *rates;
	ktime_t ack_hwtstamp;
	u8 n_rates;

	struct list_head *free_list;
};

/**
 * struct ieee80211_scan_ies - descriptors for different blocks of IEs
 *
 * This structure is used to point to different blocks of IEs in HW scan
 * and scheduled scan. These blocks contain the IEs passed by userspace
 * and the ones generated by mac80211.
 *
 * @ies: pointers to band specific IEs.
 * @len: lengths of band_specific IEs.
 * @common_ies: IEs for all bands (especially vendor specific ones)
 * @common_ie_len: length of the common_ies
 */
struct ieee80211_scan_ies {
	/* @ies and @len are parallel arrays with one slot per band */
	const u8 *ies[NUM_NL80211_BANDS];
	size_t len[NUM_NL80211_BANDS];
	const u8 *common_ies;
	size_t common_ie_len;
};


/* View the skb control buffer (skb->cb) as TX information. */
static inline struct ieee80211_tx_info *IEEE80211_SKB_CB(struct sk_buff *skb)
{
	return (struct ieee80211_tx_info *)skb->cb;
}

/* View the skb control buffer (skb->cb) as RX status. */
static inline struct ieee80211_rx_status *IEEE80211_SKB_RXCB(struct sk_buff *skb)
{
	return (struct ieee80211_rx_status *)skb->cb;
}

/**
 * ieee80211_tx_info_clear_status - clear TX status
 *
 * @info: The &struct ieee80211_tx_info to be cleared.
* * When the driver passes an skb back to mac80211, it must report * a number of things in TX status. This function clears everything * in the TX status but the rate control information (it does clear * the count since you need to fill that in anyway). * * NOTE: While the rates array is kept intact, this will wipe all of the * driver_data fields in info, so it's up to the driver to restore * any fields it needs after calling this helper. */ static inline void ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) { int i; BUILD_BUG_ON(offsetof(struct ieee80211_tx_info, status.rates) != offsetof(struct ieee80211_tx_info, control.rates)); BUILD_BUG_ON(offsetof(struct ieee80211_tx_info, status.rates) != offsetof(struct ieee80211_tx_info, driver_rates)); BUILD_BUG_ON(offsetof(struct ieee80211_tx_info, status.rates) != 8); /* clear the rate counts */ for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) info->status.rates[i].count = 0; memset_after(&info->status, 0, rates); } /** * enum mac80211_rx_flags - receive flags * * These flags are used with the @flag member of &struct ieee80211_rx_status. * @RX_FLAG_MMIC_ERROR: Michael MIC error was reported on this frame. * Use together with %RX_FLAG_MMIC_STRIPPED. * @RX_FLAG_DECRYPTED: This frame was decrypted in hardware. * @RX_FLAG_MMIC_STRIPPED: the Michael MIC is stripped off this frame, * verification has been done by the hardware. * @RX_FLAG_IV_STRIPPED: The IV and ICV are stripped from this frame. * If this flag is set, the stack cannot do any replay detection * hence the driver or hardware will have to do that. * @RX_FLAG_PN_VALIDATED: Currently only valid for CCMP/GCMP frames, this * flag indicates that the PN was verified for replay protection. * Note that this flag is also currently only supported when a frame * is also decrypted (ie. @RX_FLAG_DECRYPTED must be set) * @RX_FLAG_DUP_VALIDATED: The driver should set this flag if it did * de-duplication by itself. 
* @RX_FLAG_FAILED_FCS_CRC: Set this flag if the FCS check failed on
 *	the frame.
 * @RX_FLAG_FAILED_PLCP_CRC: Set this flag if the PLCP check failed on
 *	the frame.
 * @RX_FLAG_MACTIME: The timestamp passed in the RX status (@mactime
 *	field) is valid if this field is non-zero, and the position
 *	where the timestamp was sampled depends on the value.
 * @RX_FLAG_MACTIME_START: The timestamp passed in the RX status (@mactime
 *	field) is valid and contains the time the first symbol of the MPDU
 *	was received. This is useful in monitor mode and for proper IBSS
 *	merging.
 * @RX_FLAG_MACTIME_END: The timestamp passed in the RX status (@mactime
 *	field) is valid and contains the time the last symbol of the MPDU
 *	(including FCS) was received.
 * @RX_FLAG_MACTIME_PLCP_START: The timestamp passed in the RX status (@mactime
 *	field) is valid and contains the time the SYNC preamble was received.
 * @RX_FLAG_MACTIME_IS_RTAP_TS64: The timestamp passed in the RX status @mactime
 *	is only for use in the radiotap timestamp header, not otherwise a valid
 *	@mactime value. Note this is a separate flag so that we continue to see
 *	%RX_FLAG_MACTIME as unset. Also note that in this case the timestamp is
 *	reported to be 64 bits wide, not just 32.
 * @RX_FLAG_NO_SIGNAL_VAL: The signal strength value is not present.
 *	Valid only for data frames (mainly A-MPDU)
 * @RX_FLAG_AMPDU_DETAILS: A-MPDU details are known, in particular the reference
 *	number (@ampdu_reference) must be populated and be a distinct number for
 *	each A-MPDU
 * @RX_FLAG_AMPDU_LAST_KNOWN: last subframe is known, should be set on all
 *	subframes of a single A-MPDU
 * @RX_FLAG_AMPDU_IS_LAST: this subframe is the last subframe of the A-MPDU
 * @RX_FLAG_AMPDU_DELIM_CRC_ERROR: A delimiter CRC error has been detected
 *	on this subframe
 * @RX_FLAG_AMPDU_DELIM_CRC_KNOWN: The delimiter CRC field is known (the CRC
 *	is stored in the @ampdu_delimiter_crc field)
 * @RX_FLAG_MIC_STRIPPED: The MIC was stripped off this packet.
Decryption was * done by the hardware * @RX_FLAG_ONLY_MONITOR: Report frame only to monitor interfaces without * processing it in any regular way. * This is useful if drivers offload some frames but still want to report * them for sniffing purposes. * @RX_FLAG_SKIP_MONITOR: Process and report frame to all interfaces except * monitor interfaces. * This is useful if drivers offload some frames but still want to report * them for sniffing purposes. * @RX_FLAG_AMSDU_MORE: Some drivers may prefer to report separate A-MSDU * subframes instead of a one huge frame for performance reasons. * All, but the last MSDU from an A-MSDU should have this flag set. E.g. * if an A-MSDU has 3 frames, the first 2 must have the flag set, while * the 3rd (last) one must not have this flag set. The flag is used to * deal with retransmission/duplication recovery properly since A-MSDU * subframes share the same sequence number. Reported subframes can be * either regular MSDU or singly A-MSDUs. Subframes must not be * interleaved with other frames. * @RX_FLAG_RADIOTAP_TLV_AT_END: This frame contains radiotap TLVs in the * skb->data (before the 802.11 header). * If used, the SKB's mac_header pointer must be set to point * to the 802.11 header after the TLVs, and any padding added after TLV * data to align to 4 must be cleared by the driver putting the TLVs * in the skb. * @RX_FLAG_ALLOW_SAME_PN: Allow the same PN as same packet before. * This is used for AMSDU subframes which can have the same PN as * the first subframe. * @RX_FLAG_ICV_STRIPPED: The ICV is stripped from this frame. CRC checking must * be done in the hardware. 
* @RX_FLAG_AMPDU_EOF_BIT: Value of the EOF bit in the A-MPDU delimiter for this
 *	frame
 * @RX_FLAG_AMPDU_EOF_BIT_KNOWN: The EOF value is known
 * @RX_FLAG_RADIOTAP_HE: HE radiotap data is present
 *	(&struct ieee80211_radiotap_he, mac80211 will fill in
 *
 *	 - DATA3_DATA_MCS
 *	 - DATA3_DATA_DCM
 *	 - DATA3_CODING
 *	 - DATA5_GI
 *	 - DATA5_DATA_BW_RU_ALLOC
 *	 - DATA6_NSTS
 *	 - DATA3_STBC
 *
 *	from the RX info data, so leave those zeroed when building this data)
 * @RX_FLAG_RADIOTAP_HE_MU: HE MU radiotap data is present
 *	(&struct ieee80211_radiotap_he_mu)
 * @RX_FLAG_RADIOTAP_LSIG: L-SIG radiotap data is present
 * @RX_FLAG_NO_PSDU: use the frame only for radiotap reporting, with
 *	the "0-length PSDU" field included there.  The value for it is
 *	in &struct ieee80211_rx_status.  Note that if this value isn't
 *	known the frame shouldn't be reported.
 * @RX_FLAG_8023: the frame has an 802.3 header (decap offload performed by
 *	hardware or driver)
 */
enum mac80211_rx_flags {
	RX_FLAG_MMIC_ERROR = BIT(0),
	RX_FLAG_DECRYPTED = BIT(1),
	RX_FLAG_ONLY_MONITOR = BIT(2),
	RX_FLAG_MMIC_STRIPPED = BIT(3),
	RX_FLAG_IV_STRIPPED = BIT(4),
	RX_FLAG_FAILED_FCS_CRC = BIT(5),
	RX_FLAG_FAILED_PLCP_CRC = BIT(6),
	RX_FLAG_MACTIME_IS_RTAP_TS64 = BIT(7),
	RX_FLAG_NO_SIGNAL_VAL = BIT(8),
	RX_FLAG_AMPDU_DETAILS = BIT(9),
	RX_FLAG_PN_VALIDATED = BIT(10),
	RX_FLAG_DUP_VALIDATED = BIT(11),
	RX_FLAG_AMPDU_LAST_KNOWN = BIT(12),
	RX_FLAG_AMPDU_IS_LAST = BIT(13),
	RX_FLAG_AMPDU_DELIM_CRC_ERROR = BIT(14),
	RX_FLAG_AMPDU_DELIM_CRC_KNOWN = BIT(15),
	/*
	 * RX_FLAG_MACTIME is the mask of a two-bit field in bits 16-17; the
	 * three entries after it are the non-zero values of that field.
	 */
	RX_FLAG_MACTIME = BIT(16) | BIT(17),
	RX_FLAG_MACTIME_PLCP_START = 1 << 16,
	RX_FLAG_MACTIME_START = 2 << 16,
	RX_FLAG_MACTIME_END = 3 << 16,
	RX_FLAG_SKIP_MONITOR = BIT(18),
	RX_FLAG_AMSDU_MORE = BIT(19),
	RX_FLAG_RADIOTAP_TLV_AT_END = BIT(20),
	RX_FLAG_MIC_STRIPPED = BIT(21),
	RX_FLAG_ALLOW_SAME_PN = BIT(22),
	RX_FLAG_ICV_STRIPPED = BIT(23),
	RX_FLAG_AMPDU_EOF_BIT = BIT(24),
	RX_FLAG_AMPDU_EOF_BIT_KNOWN = BIT(25),
	RX_FLAG_RADIOTAP_HE = BIT(26),
	RX_FLAG_RADIOTAP_HE_MU = BIT(27),
	RX_FLAG_RADIOTAP_LSIG = BIT(28),
	RX_FLAG_NO_PSDU = BIT(29),
	RX_FLAG_8023 = BIT(30),
};

/**
 * enum mac80211_rx_encoding_flags - MCS & bandwidth flags
 *
 * @RX_ENC_FLAG_SHORTPRE: Short preamble was used for this frame
 * @RX_ENC_FLAG_SHORT_GI: Short guard interval was used
 * @RX_ENC_FLAG_HT_GF: This frame was received in a HT-greenfield transmission,
 *	if the driver fills this value it should add
 *	%IEEE80211_RADIOTAP_MCS_HAVE_FMT
 *	to @hw.radiotap_mcs_details to advertise that fact.
 * @RX_ENC_FLAG_LDPC: LDPC was used
 * @RX_ENC_FLAG_STBC_MASK: STBC 2 bit bitmask. 1 - Nss=1, 2 - Nss=2, 3 - Nss=3
 * @RX_ENC_FLAG_BF: packet was beamformed
 */
enum mac80211_rx_encoding_flags {
	RX_ENC_FLAG_SHORTPRE = BIT(0),
	/* BIT(1) is not assigned */
	RX_ENC_FLAG_SHORT_GI = BIT(2),
	RX_ENC_FLAG_HT_GF = BIT(3),
	RX_ENC_FLAG_STBC_MASK = BIT(4) | BIT(5),
	RX_ENC_FLAG_LDPC = BIT(6),
	RX_ENC_FLAG_BF = BIT(7),
};

/* Position of the lowest bit of RX_ENC_FLAG_STBC_MASK. */
#define RX_ENC_FLAG_STBC_SHIFT 4

/* Values of the 'encoding' member of struct ieee80211_rx_status. */
enum mac80211_rx_encoding {
	RX_ENC_LEGACY = 0,
	RX_ENC_HT,
	RX_ENC_VHT,
	RX_ENC_HE,
	RX_ENC_EHT,
};

/**
 * struct ieee80211_rx_status - receive status
 *
 * The low-level driver should provide this information (the subset
 * supported by hardware) to the 802.11 code with each received
 * frame, in the skb's control buffer (cb).
 *
 * @mactime: value in microseconds of the 64-bit Time Synchronization Function
 *	(TSF) timer when the first data symbol (MPDU) arrived at the hardware.
 * @boottime_ns: CLOCK_BOOTTIME timestamp the frame was received at, this is
 *	needed only for beacons and probe responses that update the scan cache.
 * @ack_tx_hwtstamp: Hardware timestamp for the ack TX in nanoseconds. Only
 *	needed for Timing measurement and Fine timing measurement action frames.
 *	Only reported by devices that have timestamping enabled.
* @device_timestamp: arbitrary timestamp for the device, mac80211 doesn't use
 *	it but can store it and pass it back to the driver for synchronisation
 * @band: the active band when this frame was received
 * @freq: frequency the radio was tuned to when receiving this frame, in MHz
 *	This field must be set for management frames, but isn't strictly needed
 *	for data (other) frames - for those it only affects radiotap reporting.
 * @freq_offset: @freq has a positive offset of 500 kHz.
 * @signal: signal strength when receiving this frame, either in dBm, in dB or
 *	unspecified depending on the hardware capabilities flags
 *	@IEEE80211_HW_SIGNAL_*
 * @chains: bitmask of receive chains for which separate signal strength
 *	values were filled.
 * @chain_signal: per-chain signal strength, in dBm (unlike @signal, doesn't
 *	support dB or unspecified units)
 * @antenna: antenna used
 * @rate_idx: index of data rate into band's supported rates or MCS index if
 *	HT or VHT is used (see @encoding: %RX_ENC_HT/%RX_ENC_VHT)
 * @nss: number of streams (VHT, HE and EHT only)
 * @flag: %RX_FLAG_\*
 * @encoding: &enum mac80211_rx_encoding
 * @bw: &enum rate_info_bw
 * @enc_flags: uses bits from &enum mac80211_rx_encoding_flags
 * @he_ru: HE RU, from &enum nl80211_he_ru_alloc
 * @he_gi: HE GI, from &enum nl80211_he_gi
 * @he_dcm: HE DCM value
 * @eht: EHT specific rate information
 * @eht.ru: EHT RU, from &enum nl80211_eht_ru_alloc
 * @eht.gi: EHT GI, from &enum nl80211_eht_gi
 * @rx_flags: internal RX flags for mac80211
 * @ampdu_reference: A-MPDU reference number, must be a different value for
 *	each A-MPDU but the same for each subframe within one A-MPDU
 * @ampdu_delimiter_crc: A-MPDU delimiter CRC
 * @zero_length_psdu_type: radiotap type of the 0-length PSDU
 * @link_valid: if the link which is identified by @link_id is valid. This flag
 *	is set only when connection is MLO.
 * @link_id: id of the link used to receive the packet. This is used along with
 *	@link_valid.
*/
struct ieee80211_rx_status {
	u64 mactime;
	/* the two timestamps share storage */
	union {
		u64 boottime_ns;
		ktime_t ack_tx_hwtstamp;
	};
	u32 device_timestamp;
	u32 ampdu_reference;
	u32 flag;
	u16 freq: 13, freq_offset: 1;
	u8 enc_flags;
	u8 encoding:3, bw:4;
	/* HE and EHT rate details share storage */
	union {
		struct {
			u8 he_ru:3;
			u8 he_gi:2;
			u8 he_dcm:1;
		};
		struct {
			u8 ru:4;
			u8 gi:2;
		} eht;
	};
	u8 rate_idx;
	u8 nss;
	u8 rx_flags;
	u8 band;
	u8 antenna;
	s8 signal;
	u8 chains;
	s8 chain_signal[IEEE80211_MAX_CHAINS];
	u8 ampdu_delimiter_crc;
	u8 zero_length_psdu_type;
	u8 link_valid:1, link_id:4;
};

/*
 * Return the receive frequency in kHz: the freq member converted with
 * MHZ_TO_KHZ(), plus 500 when freq_offset is set.
 */
static inline u32
ieee80211_rx_status_to_khz(struct ieee80211_rx_status *rx_status)
{
	return MHZ_TO_KHZ(rx_status->freq) +
	       (rx_status->freq_offset ? 500 : 0);
}

/**
 * enum ieee80211_conf_flags - configuration flags
 *
 * Flags to define PHY configuration options
 *
 * @IEEE80211_CONF_MONITOR: there's a monitor interface present -- use this
 *	to determine for example whether to calculate timestamps for packets
 *	or not, do not use instead of filter flags!
 * @IEEE80211_CONF_PS: Enable 802.11 power save mode (managed mode only).
 *	This is the power save mode defined by IEEE 802.11-2007 section 11.2,
 *	meaning that the hardware still wakes up for beacons, is able to
 *	transmit frames and receive the possible acknowledgment frames.
 *	Not to be confused with hardware specific wakeup/sleep states,
 *	driver is responsible for that. See the section "Powersave support"
 *	for more.
 * @IEEE80211_CONF_IDLE: The device is running, but idle; if the flag is set
 *	the driver should be prepared to handle configuration requests but
 *	may turn the device off as much as possible. Typically, this flag will
 *	be set when an interface is set UP but not associated or scanning, but
 *	it can also be unset in that case when monitor interfaces are active.
 * @IEEE80211_CONF_OFFCHANNEL: The device is currently not on its main
 *	operating channel.
*/
enum ieee80211_conf_flags {
	IEEE80211_CONF_MONITOR = (1<<0),
	IEEE80211_CONF_PS = (1<<1),
	IEEE80211_CONF_IDLE = (1<<2),
	IEEE80211_CONF_OFFCHANNEL = (1<<3),
};


/**
 * enum ieee80211_conf_changed - denotes which configuration changed
 *
 * @IEEE80211_CONF_CHANGE_LISTEN_INTERVAL: the listen interval changed
 * @IEEE80211_CONF_CHANGE_MONITOR: the monitor flag changed
 * @IEEE80211_CONF_CHANGE_PS: the PS flag or dynamic PS timeout changed
 * @IEEE80211_CONF_CHANGE_POWER: the TX power changed
 * @IEEE80211_CONF_CHANGE_CHANNEL: the channel/channel_type changed
 * @IEEE80211_CONF_CHANGE_RETRY_LIMITS: retry limits changed
 * @IEEE80211_CONF_CHANGE_IDLE: Idle flag changed
 * @IEEE80211_CONF_CHANGE_SMPS: Spatial multiplexing powersave mode changed
 *	Note that this is only valid if channel contexts are not used,
 *	otherwise each channel context has the number of chains listed.
 */
enum ieee80211_conf_changed {
	/* numbering starts at BIT(1); BIT(0) is not assigned */
	IEEE80211_CONF_CHANGE_SMPS = BIT(1),
	IEEE80211_CONF_CHANGE_LISTEN_INTERVAL = BIT(2),
	IEEE80211_CONF_CHANGE_MONITOR = BIT(3),
	IEEE80211_CONF_CHANGE_PS = BIT(4),
	IEEE80211_CONF_CHANGE_POWER = BIT(5),
	IEEE80211_CONF_CHANGE_CHANNEL = BIT(6),
	IEEE80211_CONF_CHANGE_RETRY_LIMITS = BIT(7),
	IEEE80211_CONF_CHANGE_IDLE = BIT(8),
};

/**
 * enum ieee80211_smps_mode - spatial multiplexing power save mode
 *
 * @IEEE80211_SMPS_AUTOMATIC: automatic
 * @IEEE80211_SMPS_OFF: off
 * @IEEE80211_SMPS_STATIC: static
 * @IEEE80211_SMPS_DYNAMIC: dynamic
 * @IEEE80211_SMPS_NUM_MODES: internal, don't use
 */
enum ieee80211_smps_mode {
	IEEE80211_SMPS_AUTOMATIC,
	IEEE80211_SMPS_OFF,
	IEEE80211_SMPS_STATIC,
	IEEE80211_SMPS_DYNAMIC,

	/* keep last */
	IEEE80211_SMPS_NUM_MODES,
};

/**
 * struct ieee80211_conf - configuration of the device
 *
 * This struct indicates how the driver shall configure the hardware.
 *
 * @flags: configuration flags defined above
 *
 * @listen_interval: listen interval in units of beacon interval
 * @ps_dtim_period: The DTIM period of the AP we're connected to, for use
 *	in power saving.
Power saving will not be enabled until a beacon
 *	has been received and the DTIM period is known.
 * @dynamic_ps_timeout: The dynamic powersave timeout (in ms), see the
 *	powersave documentation below. This variable is valid only when
 *	the CONF_PS flag is set.
 *
 * @power_level: requested transmit power (in dBm), backward compatibility
 *	value only that is set to the minimum of all interfaces
 *
 * @chandef: the channel definition to tune to
 * @radar_enabled: whether radar detection is enabled
 *
 * @long_frame_max_tx_count: Maximum number of transmissions for a "long" frame
 *	(a frame not RTS protected), called "dot11LongRetryLimit" in 802.11,
 *	but actually means the number of transmissions not the number of retries
 * @short_frame_max_tx_count: Maximum number of transmissions for a "short"
 *	frame, called "dot11ShortRetryLimit" in 802.11, but actually means the
 *	number of transmissions not the number of retries
 *
 * @smps_mode: spatial multiplexing powersave mode; note that
 *	%IEEE80211_SMPS_STATIC is used when the device is not
 *	configured for an HT channel.
 *	Note that this is only valid if channel contexts are not used,
 *	otherwise each channel context has the number of chains listed.
 */
struct ieee80211_conf {
	u32 flags;
	/* power_level in dBm, dynamic_ps_timeout in ms (per the kernel-doc) */
	int power_level, dynamic_ps_timeout;

	u16 listen_interval;
	u8 ps_dtim_period;

	/* transmission counts, not retry counts (per the kernel-doc) */
	u8 long_frame_max_tx_count, short_frame_max_tx_count;

	struct cfg80211_chan_def chandef;
	bool radar_enabled;
	enum ieee80211_smps_mode smps_mode;
};

/**
 * struct ieee80211_channel_switch - holds the channel switch data
 *
 * The information provided in this structure is required for channel switch
 * operation.
 *
 * @timestamp: value in microseconds of the 64-bit Time Synchronization
 *	Function (TSF) timer when the frame containing the channel switch
 *	announcement was received. This is simply the rx.mactime parameter
 *	the driver passed into mac80211.
* @device_timestamp: arbitrary timestamp for the device, this is the
 *	rx.device_timestamp parameter the driver passed to mac80211.
 * @block_tx: Indicates whether transmission must be blocked before the
 *	scheduled channel switch, as indicated by the AP.
 * @chandef: the new channel to switch to
 * @count: the number of TBTT's until the channel switch event
 * @delay: maximum delay between the time the AP transmitted the last beacon in
 *	current channel and the expected time of the first beacon in the new
 *	channel, expressed in TU.
 * @link_id: the link ID of the link doing the channel switch, 0 for non-MLO
 */
struct ieee80211_channel_switch {
	u64 timestamp;
	u32 device_timestamp;
	bool block_tx;
	struct cfg80211_chan_def chandef;
	u8 count;	/* counted in TBTTs */
	u8 link_id;	/* 0 for non-MLO */
	u32 delay;	/* expressed in TU */
};

/**
 * enum ieee80211_vif_flags - virtual interface flags
 *
 * @IEEE80211_VIF_BEACON_FILTER: the device performs beacon filtering
 *	on this virtual interface to avoid unnecessary CPU wakeups
 * @IEEE80211_VIF_SUPPORTS_CQM_RSSI: the device can do connection quality
 *	monitoring on this virtual interface -- i.e. it can monitor
 *	connection quality related parameters, such as the RSSI level and
 *	provide notifications if configured trigger levels are reached.
 * @IEEE80211_VIF_SUPPORTS_UAPSD: The device can do U-APSD for this
 *	interface. This flag should be set during interface addition,
 *	but may be set/cleared as late as authentication to an AP. It is
 *	only valid for managed/station mode interfaces.
 * @IEEE80211_VIF_GET_NOA_UPDATE: request to handle NOA attributes
 *	and send P2P_PS notification to the driver if NOA changed, even
 *	this is not pure P2P vif.
 * @IEEE80211_VIF_EML_ACTIVE: The driver indicates that EML operation is
 *	enabled for the interface.
 * @IEEE80211_VIF_IGNORE_OFDMA_WIDER_BW: Ignore wider bandwidth OFDMA
 *	operation on this interface and request a channel context without
 *	the AP definition. Use this e.g.
because the device is able to
 *	handle OFDMA (downlink and trigger for uplink) on a per-AP basis.
 */
enum ieee80211_vif_flags {
	/* one bit per flag, so values can be OR-ed into a single mask */
	IEEE80211_VIF_BEACON_FILTER		= BIT(0),
	IEEE80211_VIF_SUPPORTS_CQM_RSSI		= BIT(1),
	IEEE80211_VIF_SUPPORTS_UAPSD		= BIT(2),
	IEEE80211_VIF_GET_NOA_UPDATE		= BIT(3),
	IEEE80211_VIF_EML_ACTIVE		= BIT(4),
	IEEE80211_VIF_IGNORE_OFDMA_WIDER_BW	= BIT(5),
};

/**
 * enum ieee80211_offload_flags - virtual interface offload flags
 *
 * @IEEE80211_OFFLOAD_ENCAP_ENABLED: tx encapsulation offload is enabled
 *	The driver supports sending frames passed as 802.3 frames by mac80211.
 *	It must also support sending 802.11 packets for the same interface.
 * @IEEE80211_OFFLOAD_ENCAP_4ADDR: support 4-address mode encapsulation offload
 * @IEEE80211_OFFLOAD_DECAP_ENABLED: rx encapsulation offload is enabled
 *	The driver supports passing received 802.11 frames as 802.3 frames to
 *	mac80211.
 */
enum ieee80211_offload_flags {
	/* one bit per flag, so values can be OR-ed into a single mask */
	IEEE80211_OFFLOAD_ENCAP_ENABLED		= BIT(0),
	IEEE80211_OFFLOAD_ENCAP_4ADDR		= BIT(1),
	IEEE80211_OFFLOAD_DECAP_ENABLED		= BIT(2),
};

/**
 * struct ieee80211_vif_cfg - interface configuration
 * @assoc: association status
 * @ibss_joined: indicates whether this station is part of an IBSS or not
 * @ibss_creator: indicates if a new IBSS network is being created
 * @ps: power-save mode (STA only). This flag is NOT affected by
 *	offchannel/dynamic_ps operations.
 * @aid: association ID number, valid only when @assoc is true
 * @eml_cap: EML capabilities as described in P802.11be_D4.1 Figure 9-1001j.
 * @eml_med_sync_delay: Medium Synchronization delay as described in
 *	P802.11be_D4.1 Figure 9-1001i.
 * @mld_capa_op: MLD Capabilities and Operations per P802.11be_D4.1
 *	Figure 9-1001k
 * @arp_addr_list: List of IPv4 addresses for hardware ARP filtering. The
 *	may filter ARP queries targeted for other addresses than listed here.
 *	The driver must allow ARP queries targeted for all address listed here
 *	to pass through. An empty list implies no ARP queries need to pass.
* @arp_addr_cnt: Number of addresses currently on the list. Note that this
 *	may be larger than %IEEE80211_BSS_ARP_ADDR_LIST_LEN (the arp_addr_list
 *	array size), it's up to the driver what to do in that case.
 * @ssid: The SSID of the current vif. Valid in AP and IBSS mode.
 * @ssid_len: Length of SSID given in @ssid.
 * @s1g: BSS is S1G BSS (affects Association Request format).
 * @idle: This interface is idle. There's also a global idle flag in the
 *	hardware config which may be more appropriate depending on what
 *	your driver/device needs to do.
 * @ap_addr: AP MLD address, or BSSID for non-MLO connections
 *	(station mode only)
 */
struct ieee80211_vif_cfg {
	/* association related data */
	bool assoc, ibss_joined;
	bool ibss_creator;
	bool ps;
	u16 aid;
	u16 eml_cap;
	u16 eml_med_sync_delay;
	u16 mld_capa_op;

	__be32 arp_addr_list[IEEE80211_BSS_ARP_ADDR_LIST_LEN];
	/* may exceed the array size above; see the @arp_addr_cnt kernel-doc */
	int arp_addr_cnt;
	u8 ssid[IEEE80211_MAX_SSID_LEN];
	size_t ssid_len;
	bool s1g;
	bool idle;
	u8 ap_addr[ETH_ALEN] __aligned(2);
};

/* Length of the per-TID arrays in &struct ieee80211_neg_ttlm. */
#define IEEE80211_TTLM_NUM_TIDS 8

/**
 * struct ieee80211_neg_ttlm - negotiated TID to link map info
 *
 * @downlink: bitmap of active links per TID for downlink, or 0 if mapping for
 *	this TID is not included.
 * @uplink: bitmap of active links per TID for uplink, or 0 if mapping for this
 *	TID is not included.
 * @valid: info is valid or not.
 */
struct ieee80211_neg_ttlm {
	u16 downlink[IEEE80211_TTLM_NUM_TIDS];
	u16 uplink[IEEE80211_TTLM_NUM_TIDS];
	bool valid;
};

/**
 * enum ieee80211_neg_ttlm_res - return value for negotiated TTLM handling
 * @NEG_TTLM_RES_ACCEPT: accept the request
 * @NEG_TTLM_RES_REJECT: reject the request
 * @NEG_TTLM_RES_SUGGEST_PREFERRED: reject and suggest a new mapping
 */
enum ieee80211_neg_ttlm_res {
	NEG_TTLM_RES_ACCEPT,
	NEG_TTLM_RES_REJECT,
	NEG_TTLM_RES_SUGGEST_PREFERRED
};

/**
 * struct ieee80211_vif - per-interface data
 *
 * Data in this structure is continually present for driver
 * use during the life of a virtual interface.
* * @type: type of this virtual interface * @cfg: vif configuration, see &struct ieee80211_vif_cfg * @bss_conf: BSS configuration for this interface, either our own * or the BSS we're associated to * @link_conf: in case of MLD, the per-link BSS configuration, * indexed by link ID * @valid_links: bitmap of valid links, or 0 for non-MLO. * @active_links: The bitmap of active links, or 0 for non-MLO. * The driver shouldn't change this directly, but use the * API calls meant for that purpose. * @dormant_links: bitmap of valid but disabled links, or 0 for non-MLO. * Must be a subset of valid_links. * @suspended_links: subset of dormant_links representing links that are * suspended. * 0 for non-MLO. * @neg_ttlm: negotiated TID to link mapping info. * see &struct ieee80211_neg_ttlm. * @addr: address of this interface * @p2p: indicates whether this AP or STA interface is a p2p * interface, i.e. a GO or p2p-sta respectively * @netdev_features: tx netdev features supported by the hardware for this * vif. mac80211 initializes this to hw->netdev_features, and the driver * can mask out specific tx features. mac80211 will handle software fixup * for masked offloads (GSO, CSUM) * @driver_flags: flags/capabilities the driver has for this interface, * these need to be set (or cleared) when the interface is added * or, if supported by the driver, the interface type is changed * at runtime, mac80211 will never touch this field * @offload_flags: hardware offload capabilities/flags for this interface. * These are initialized by mac80211 before calling .add_interface, * .change_interface or .update_vif_offload and updated by the driver * within these ops, based on supported features or runtime change * restrictions. * @hw_queue: hardware queue for each AC * @cab_queue: content-after-beacon (DTIM beacon really) queue, AP mode only * @debugfs_dir: debugfs dentry, can be used by drivers to create own per * interface debug files. 
Note that it will be NULL for the virtual
 *	monitor interface (if that is requested.)
 * @probe_req_reg: probe requests should be reported to mac80211 for this
 *	interface.
 * @rx_mcast_action_reg: multicast Action frames should be reported to mac80211
 *	for this interface.
 * @drv_priv: data area for driver use, will always be aligned to
 *	sizeof(void \*).
 * @txq: the multicast data TX queue
 * @offload_flags: 802.3 -> 802.11 enapsulation offload flags, see
 *	&enum ieee80211_offload_flags.
 * @mbssid_tx_vif: Pointer to the transmitting interface if MBSSID is enabled.
 */
struct ieee80211_vif {
	enum nl80211_iftype type;
	struct ieee80211_vif_cfg cfg;
	struct ieee80211_bss_conf bss_conf;
	/* RCU-annotated per-link pointers, one slot per possible link ID */
	struct ieee80211_bss_conf __rcu *link_conf[IEEE80211_MLD_MAX_NUM_LINKS];
	u16 valid_links, active_links, dormant_links, suspended_links;
	struct ieee80211_neg_ttlm neg_ttlm;
	u8 addr[ETH_ALEN] __aligned(2);
	bool p2p;

	u8 cab_queue;
	u8 hw_queue[IEEE80211_NUM_ACS];

	struct ieee80211_txq *txq;

	netdev_features_t netdev_features;
	u32 driver_flags;
	u32 offload_flags;

	/* the debugfs dentry only exists when mac80211 debugfs is built in */
#ifdef CONFIG_MAC80211_DEBUGFS
	struct dentry *debugfs_dir;
#endif

	bool probe_req_reg;
	bool rx_mcast_action_reg;

	struct ieee80211_vif *mbssid_tx_vif;

	/* must be last */
	u8 drv_priv[] __aligned(sizeof(void *));
};

/**
 * ieee80211_vif_usable_links - Return the usable links for the vif
 * @vif: the vif for which the usable links are requested
 * Return: the usable link bitmap
 */
static inline u16 ieee80211_vif_usable_links(const struct ieee80211_vif *vif)
{
	/* usable == valid links with the dormant ones masked out */
	return vif->valid_links & ~vif->dormant_links;
}

/**
 * ieee80211_vif_is_mld - Returns true iff the vif is an MLD one
 * @vif: the vif
 * Return: %true if the vif is an MLD, %false otherwise.
*/ static inline bool ieee80211_vif_is_mld(const struct ieee80211_vif *vif) { /* valid_links != 0 indicates this vif is an MLD */ return vif->valid_links != 0; } /** * ieee80211_vif_link_active - check if a given link is active * @vif: the vif * @link_id: the link ID to check * Return: %true if the vif is an MLD and the link is active, or if * the vif is not an MLD and the link ID is 0; %false otherwise. */ static inline bool ieee80211_vif_link_active(const struct ieee80211_vif *vif, unsigned int link_id) { if (!ieee80211_vif_is_mld(vif)) return link_id == 0; return vif->active_links & BIT(link_id); } #define for_each_vif_active_link(vif, link, link_id) \ for (link_id = 0; link_id < ARRAY_SIZE((vif)->link_conf); link_id++) \ if ((!(vif)->active_links || \ (vif)->active_links & BIT(link_id)) && \ (link = link_conf_dereference_check(vif, link_id))) static inline bool ieee80211_vif_is_mesh(struct ieee80211_vif *vif) { #ifdef CONFIG_MAC80211_MESH return vif->type == NL80211_IFTYPE_MESH_POINT; #endif return false; } /** * wdev_to_ieee80211_vif - return a vif struct from a wdev * @wdev: the wdev to get the vif for * * This can be used by mac80211 drivers with direct cfg80211 APIs * (like the vendor commands) that get a wdev. * * Note that this function may return %NULL if the given wdev isn't * associated with a vif that the driver knows about (e.g. monitor * or AP_VLAN interfaces.) */ struct ieee80211_vif *wdev_to_ieee80211_vif(struct wireless_dev *wdev); /** * ieee80211_vif_to_wdev - return a wdev struct from a vif * @vif: the vif to get the wdev for * * This can be used by mac80211 drivers with direct cfg80211 APIs * (like the vendor commands) that needs to get the wdev for a vif. * This can also be useful to get the netdev associated to a vif. 
*/
struct wireless_dev *ieee80211_vif_to_wdev(struct ieee80211_vif *vif);

/*
 * Return lockdep_is_held() for the wiphy mutex of the wdev that
 * ieee80211_vif_to_wdev() yields for @vif.
 */
static inline bool lockdep_vif_wiphy_mutex_held(struct ieee80211_vif *vif)
{
	return lockdep_is_held(&ieee80211_vif_to_wdev(vif)->wiphy->mtx);
}

/*
 * Dereference (vif)->link_conf[link_id] via rcu_dereference_protected(),
 * passing lockdep_vif_wiphy_mutex_held(vif) as the lockdep condition.
 */
#define link_conf_dereference_protected(vif, link_id)		\
	rcu_dereference_protected((vif)->link_conf[link_id],	\
				  lockdep_vif_wiphy_mutex_held(vif))

/*
 * Dereference (vif)->link_conf[link_id] via rcu_dereference_check(),
 * passing lockdep_vif_wiphy_mutex_held(vif) as the lockdep condition.
 */
#define link_conf_dereference_check(vif, link_id)		\
	rcu_dereference_check((vif)->link_conf[link_id],	\
			      lockdep_vif_wiphy_mutex_held(vif))

/**
 * enum ieee80211_key_flags - key flags
 *
 * These flags are used for communication about keys between the driver
 * and mac80211, with the @flags parameter of &struct ieee80211_key_conf.
 *
 * @IEEE80211_KEY_FLAG_GENERATE_IV: This flag should be set by the
 *	driver to indicate that it requires IV generation for this
 *	particular key. Setting this flag does not necessarily mean that SKBs
 *	will have sufficient tailroom for ICV or MIC.
 * @IEEE80211_KEY_FLAG_GENERATE_MMIC: This flag should be set by
 *	the driver for a TKIP key if it requires Michael MIC
 *	generation in software.
 * @IEEE80211_KEY_FLAG_PAIRWISE: Set by mac80211, this flag indicates
 *	that the key is pairwise rather then a shared key.
 * @IEEE80211_KEY_FLAG_SW_MGMT_TX: This flag should be set by the driver for a
 *	CCMP/GCMP key if it requires CCMP/GCMP encryption of management frames
 *	(MFP) to be done in software.
 * @IEEE80211_KEY_FLAG_PUT_IV_SPACE: This flag should be set by the driver
 *	if space should be prepared for the IV, but the IV
 *	itself should not be generated. Do not set together with
 *	@IEEE80211_KEY_FLAG_GENERATE_IV on the same key. Setting this flag does
 *	not necessarily mean that SKBs will have sufficient tailroom for ICV or
 *	MIC.
 * @IEEE80211_KEY_FLAG_RX_MGMT: This key will be used to decrypt received
 *	management frames.
The flag can help drivers that have a hardware * crypto implementation that doesn't deal with management frames * properly by allowing them to not upload the keys to hardware and * fall back to software crypto. Note that this flag deals only with * RX, if your crypto engine can't deal with TX you can also set the * %IEEE80211_KEY_FLAG_SW_MGMT_TX flag to encrypt such frames in SW. * @IEEE80211_KEY_FLAG_GENERATE_IV_MGMT: This flag should be set by the * driver for a CCMP/GCMP key to indicate that it requires IV generation * only for management frames (MFP). * @IEEE80211_KEY_FLAG_RESERVE_TAILROOM: This flag should be set by the * driver for a key to indicate that sufficient tailroom must always * be reserved for ICV or MIC, even when HW encryption is enabled. * @IEEE80211_KEY_FLAG_PUT_MIC_SPACE: This flag should be set by the driver for * a TKIP key if it only requires MIC space. Do not set together with * @IEEE80211_KEY_FLAG_GENERATE_MMIC on the same key. * @IEEE80211_KEY_FLAG_NO_AUTO_TX: Key needs explicit Tx activation. 
* @IEEE80211_KEY_FLAG_GENERATE_MMIE: This flag should be set by the driver
 *	for a AES_CMAC key to indicate that it requires sequence number
 *	generation only
 * @IEEE80211_KEY_FLAG_SPP_AMSDU: SPP A-MSDUs can be used with this key
 *	(set by mac80211 from the sta->spp_amsdu flag)
 */
enum ieee80211_key_flags {
	/* highest value is BIT(11), so every flag fits in 16 bits */
	IEEE80211_KEY_FLAG_GENERATE_IV_MGMT	= BIT(0),
	IEEE80211_KEY_FLAG_GENERATE_IV		= BIT(1),
	IEEE80211_KEY_FLAG_GENERATE_MMIC	= BIT(2),
	IEEE80211_KEY_FLAG_PAIRWISE		= BIT(3),
	IEEE80211_KEY_FLAG_SW_MGMT_TX		= BIT(4),
	IEEE80211_KEY_FLAG_PUT_IV_SPACE		= BIT(5),
	IEEE80211_KEY_FLAG_RX_MGMT		= BIT(6),
	IEEE80211_KEY_FLAG_RESERVE_TAILROOM	= BIT(7),
	IEEE80211_KEY_FLAG_PUT_MIC_SPACE	= BIT(8),
	IEEE80211_KEY_FLAG_NO_AUTO_TX		= BIT(9),
	IEEE80211_KEY_FLAG_GENERATE_MMIE	= BIT(10),
	IEEE80211_KEY_FLAG_SPP_AMSDU		= BIT(11),
};

/**
 * struct ieee80211_key_conf - key information
 *
 * This key information is given by mac80211 to the driver by
 * the set_key() callback in &struct ieee80211_ops.
 *
 * @hw_key_idx: To be set by the driver, this is the key index the driver
 *	wants to be given when a frame is transmitted and needs to be
 *	encrypted in hardware.
 * @cipher: The key's cipher suite selector.
 * @tx_pn: PN used for TX keys, may be used by the driver as well if it
 *	needs to do software PN assignment by itself (e.g. due to TSO)
 * @flags: key flags, see &enum ieee80211_key_flags.
 * @keyidx: the key index (0-3)
 * @keylen: key material length
 * @key: key material.
For ALG_TKIP the key is encoded as a 256-bit (32 byte) * data block: * - Temporal Encryption Key (128 bits) * - Temporal Authenticator Tx MIC Key (64 bits) * - Temporal Authenticator Rx MIC Key (64 bits) * @icv_len: The ICV length for this key type * @iv_len: The IV length for this key type * @link_id: the link ID for MLO, or -1 for non-MLO or pairwise keys */ struct ieee80211_key_conf { atomic64_t tx_pn; u32 cipher; u8 icv_len; u8 iv_len; u8 hw_key_idx; s8 keyidx; u16 flags; s8 link_id; u8 keylen; u8 key[]; }; #define IEEE80211_MAX_PN_LEN 16 #define TKIP_PN_TO_IV16(pn) ((u16)(pn & 0xffff)) #define TKIP_PN_TO_IV32(pn) ((u32)((pn >> 16) & 0xffffffff)) /** * struct ieee80211_key_seq - key sequence counter * * @tkip: TKIP data, containing IV32 and IV16 in host byte order * @ccmp: PN data, most significant byte first (big endian, * reverse order than in packet) * @aes_cmac: PN data, most significant byte first (big endian, * reverse order than in packet) * @aes_gmac: PN data, most significant byte first (big endian, * reverse order than in packet) * @gcmp: PN data, most significant byte first (big endian, * reverse order than in packet) * @hw: data for HW-only (e.g. cipher scheme) keys */ struct ieee80211_key_seq { union { struct { u32 iv32; u16 iv16; } tkip; struct { u8 pn[6]; } ccmp; struct { u8 pn[6]; } aes_cmac; struct { u8 pn[6]; } aes_gmac; struct { u8 pn[6]; } gcmp; struct { u8 seq[IEEE80211_MAX_PN_LEN]; u8 seq_len; } hw; }; }; /** * enum set_key_cmd - key command * * Used with the set_key() callback in &struct ieee80211_ops, this * indicates whether a key is being removed or added. 
*
 * @SET_KEY: a key is set
 * @DISABLE_KEY: a key must be disabled
 */
enum set_key_cmd {
	SET_KEY,
	DISABLE_KEY,
};

/**
 * enum ieee80211_sta_state - station state
 *
 * @IEEE80211_STA_NOTEXIST: station doesn't exist at all,
 *	this is a special state for add/remove transitions
 * @IEEE80211_STA_NONE: station exists without special state
 * @IEEE80211_STA_AUTH: station is authenticated
 * @IEEE80211_STA_ASSOC: station is associated
 * @IEEE80211_STA_AUTHORIZED: station is authorized (802.1X)
 */
enum ieee80211_sta_state {
	/* NOTE: These need to be ordered correctly! */
	IEEE80211_STA_NOTEXIST,
	IEEE80211_STA_NONE,
	IEEE80211_STA_AUTH,
	IEEE80211_STA_ASSOC,
	IEEE80211_STA_AUTHORIZED,
};

/**
 * enum ieee80211_sta_rx_bandwidth - station RX bandwidth
 * @IEEE80211_STA_RX_BW_20: station can only receive 20 MHz
 * @IEEE80211_STA_RX_BW_40: station can receive up to 40 MHz
 * @IEEE80211_STA_RX_BW_80: station can receive up to 80 MHz
 * @IEEE80211_STA_RX_BW_160: station can receive up to 160 MHz
 *	(including 80+80 MHz)
 * @IEEE80211_STA_RX_BW_320: station can receive up to 320 MHz
 *
 * Implementation note: 20 must be zero to be initialized
 *	correctly, the values must be sorted.
 */
enum ieee80211_sta_rx_bandwidth {
	/* explicitly 0, as required by the implementation note above */
	IEEE80211_STA_RX_BW_20 = 0,
	IEEE80211_STA_RX_BW_40,
	IEEE80211_STA_RX_BW_80,
	IEEE80211_STA_RX_BW_160,
	IEEE80211_STA_RX_BW_320,
};

/**
 * struct ieee80211_sta_rates - station rate selection table
 *
 * @rcu_head: RCU head used for freeing the table on update
 * @rate: transmit rates/flags to be used by default.
 *	Overriding entries per-packet is possible by using cb tx control.
 */
struct ieee80211_sta_rates {
	struct rcu_head rcu_head;
	/* table of IEEE80211_TX_RATE_TABLE_SIZE entries */
	struct {
		s8 idx;
		u8 count;
		u8 count_cts;
		u8 count_rts;
		u16 flags;
	} rate[IEEE80211_TX_RATE_TABLE_SIZE];
};

/**
 * struct ieee80211_sta_txpwr - station txpower configuration
 *
 * Used to configure txpower for station.
 *
 * @power: indicates the tx power, in dBm, to be used when sending data frames
 *	to the STA.
* @type: In particular if TPC %type is NL80211_TX_POWER_LIMITED then tx power
 *	will be less than or equal to specified from userspace, whereas if TPC
 *	%type is NL80211_TX_POWER_AUTOMATIC then it indicates default tx power.
 *	NL80211_TX_POWER_FIXED is not a valid configuration option for
 *	per peer TPC.
 */
struct ieee80211_sta_txpwr {
	s16 power;	/* in dBm, per the kernel-doc */
	enum nl80211_tx_power_setting type;
};

/**
 * struct ieee80211_sta_aggregates - info that is aggregated from active links
 *
 * Used for any per-link data that needs to be aggregated and updated in the
 * main &struct ieee80211_sta when updated or the active links change.
 *
 * @max_amsdu_len: indicates the maximal length of an A-MSDU in bytes.
 *	This field is always valid for packets with a VHT preamble.
 *	For packets with a HT preamble, additional limits apply:
 *
 *	* If the skb is transmitted as part of a BA agreement, the
 *	  A-MSDU maximal size is min(max_amsdu_len, 4065) bytes.
 *	* If the skb is not part of a BA agreement, the A-MSDU maximal
 *	  size is min(max_amsdu_len, 7935) bytes.
 *
 * Both additional HT limits must be enforced by the low level
 * driver. This is defined by the spec (IEEE 802.11-2012 section
 * 8.3.2.2 NOTE 2).
 * @max_rc_amsdu_len: Maximum A-MSDU size in bytes recommended by rate control.
 * @max_tid_amsdu_len: Maximum A-MSDU size in bytes for this TID
 */
struct ieee80211_sta_aggregates {
	u16 max_amsdu_len;

	u16 max_rc_amsdu_len;
	/* array of IEEE80211_NUM_TIDS per-TID limits */
	u16 max_tid_amsdu_len[IEEE80211_NUM_TIDS];
};

/**
 * struct ieee80211_link_sta - station Link specific info
 * All link specific info for a STA link for a non MLD STA(single)
 * or a MLD STA(multiple entries) are stored here.
 *
 * @sta: reference to owning STA
 * @addr: MAC address of the Link STA. For non-MLO STA this is same as the addr
 *	in ieee80211_sta.
For MLO Link STA this addr can be same or different
 *	from addr in ieee80211_sta (representing MLD STA addr)
 * @link_id: the link ID for this link STA (0 for deflink)
 * @smps_mode: current SMPS mode (off, static or dynamic)
 * @supp_rates: Bitmap of supported rates
 * @ht_cap: HT capabilities of this STA; restricted to our own capabilities
 * @vht_cap: VHT capabilities of this STA; restricted to our own capabilities
 * @he_cap: HE capabilities of this STA
 * @he_6ghz_capa: on 6 GHz, holds the HE 6 GHz band capabilities
 * @eht_cap: EHT capabilities of this STA
 * @agg: per-link data for multi-link aggregation
 * @bandwidth: current bandwidth the station can receive with
 * @rx_nss: in HT/VHT, the maximum number of spatial streams the
 *	station can receive at the moment, changed by operating mode
 *	notifications and capabilities. The value is only valid after
 *	the station moves to associated state.
 * @txpwr: the station tx power configuration
 *
 */
struct ieee80211_link_sta {
	/* the station this link entry belongs to */
	struct ieee80211_sta *sta;

	u8 addr[ETH_ALEN];
	u8 link_id;
	enum ieee80211_smps_mode smps_mode;

	/* one rate bitmap per band */
	u32 supp_rates[NUM_NL80211_BANDS];
	struct ieee80211_sta_ht_cap ht_cap;
	struct ieee80211_sta_vht_cap vht_cap;
	struct ieee80211_sta_he_cap he_cap;
	struct ieee80211_he_6ghz_capa he_6ghz_capa;
	struct ieee80211_sta_eht_cap eht_cap;

	struct ieee80211_sta_aggregates agg;

	u8 rx_nss;
	enum ieee80211_sta_rx_bandwidth bandwidth;
	struct ieee80211_sta_txpwr txpwr;
};

/**
 * struct ieee80211_sta - station table entry
 *
 * A station table entry represents a station we are possibly
 * communicating with. Since stations are RCU-managed in
 * mac80211, any ieee80211_sta pointer you get access to must
 * either be protected by rcu_read_lock() explicitly or implicitly,
 * or you must take good care to not use such a pointer after a
 * call to your sta_remove callback that removed it.
* This also represents the MLD STA in case of MLO association * and holds pointers to various link STA's * * @addr: MAC address * @aid: AID we assigned to the station if we're an AP * @max_rx_aggregation_subframes: maximal amount of frames in a single AMPDU * that this station is allowed to transmit to us. * Can be modified by driver. * @wme: indicates whether the STA supports QoS/WME (if local devices does, * otherwise always false) * @drv_priv: data area for driver use, will always be aligned to * sizeof(void \*), size is determined in hw information. * @uapsd_queues: bitmap of queues configured for uapsd. Only valid * if wme is supported. The bits order is like in * IEEE80211_WMM_IE_STA_QOSINFO_AC_*. * @max_sp: max Service Period. Only valid if wme is supported. * @rates: rate control selection table * @tdls: indicates whether the STA is a TDLS peer * @tdls_initiator: indicates the STA is an initiator of the TDLS link. Only * valid if the STA is a TDLS peer in the first place. * @mfp: indicates whether the STA uses management frame protection or not. * @mlo: indicates whether the STA is MLO station. * @max_amsdu_subframes: indicates the maximal number of MSDUs in a single * A-MSDU. Taken from the Extended Capabilities element. 0 means * unlimited. * @cur: currently valid data as aggregated from the active links * For non MLO STA it will point to the deflink data. For MLO STA * ieee80211_sta_recalc_aggregates() must be called to update it. * @support_p2p_ps: indicates whether the STA supports P2P PS mechanism or not. * @txq: per-TID data TX queues; note that the last entry (%IEEE80211_NUM_TIDS) * is used for non-data frames * @deflink: This holds the default link STA information, for non MLO STA all link * specific STA information is accessed through @deflink or through * link[0] which points to address of @deflink. For MLO Link STA * the first added link STA will point to deflink. * @link: reference to Link Sta entries. 
For Non MLO STA, except 1st link,
 *	i.e link[0] all links would be assigned to NULL by default and
 *	would access link information via @deflink or link[0]. For MLO
 *	STA, first link STA being added will point its link pointer to
 *	@deflink address and remaining would be allocated and the address
 *	would be assigned to link[link_id] where link_id is the id assigned
 *	by the AP.
 * @valid_links: bitmap of valid links, or 0 for non-MLO
 * @spp_amsdu: indicates whether the STA uses SPP A-MSDU or not.
 */
struct ieee80211_sta {
	u8 addr[ETH_ALEN];
	u16 aid;
	u16 max_rx_aggregation_subframes;
	bool wme;
	u8 uapsd_queues;
	u8 max_sp;
	struct ieee80211_sta_rates __rcu *rates;
	bool tdls;
	bool tdls_initiator;
	bool mfp;
	bool mlo;
	bool spp_amsdu;
	u8 max_amsdu_subframes;

	struct ieee80211_sta_aggregates *cur;

	bool support_p2p_ps;

	/* one extra slot beyond the TIDs; see the @txq kernel-doc */
	struct ieee80211_txq *txq[IEEE80211_NUM_TIDS + 1];

	u16 valid_links;
	struct ieee80211_link_sta deflink;
	/* RCU-annotated per-link pointers, one slot per possible link ID */
	struct ieee80211_link_sta __rcu *link[IEEE80211_MLD_MAX_NUM_LINKS];

	/* must be last */
	u8 drv_priv[] __aligned(sizeof(void *));
};

/*
 * lockdep_sta_mutex_held() is only declared as a real function when
 * CONFIG_LOCKDEP is enabled; otherwise the inline stub below always
 * returns true.
 */
#ifdef CONFIG_LOCKDEP
bool lockdep_sta_mutex_held(struct ieee80211_sta *pubsta);
#else
static inline bool lockdep_sta_mutex_held(struct ieee80211_sta *pubsta)
{
	return true;
}
#endif

/*
 * Dereference (sta)->link[link_id] via rcu_dereference_protected(),
 * passing lockdep_sta_mutex_held(sta) as the lockdep condition.
 */
#define link_sta_dereference_protected(sta, link_id)		\
	rcu_dereference_protected((sta)->link[link_id],		\
				  lockdep_sta_mutex_held(sta))

/*
 * Dereference (sta)->link[link_id] via rcu_dereference_check(),
 * passing lockdep_sta_mutex_held(sta) as the lockdep condition.
 */
#define link_sta_dereference_check(sta, link_id)		\
	rcu_dereference_check((sta)->link[link_id],		\
			      lockdep_sta_mutex_held(sta))

/*
 * Walk every index of (sta)->link and run the loop body for each index
 * whose link pointer is non-NULL and which is either set in
 * (vif)->active_links or, when active_links is 0, for any index.
 * @link_sta receives the dereferenced pointer, @link_id is the loop index.
 */
#define for_each_sta_active_link(vif, sta, link_sta, link_id)			\
	for (link_id = 0; link_id < ARRAY_SIZE((sta)->link); link_id++)		\
		if ((!(vif)->active_links ||					\
		     (vif)->active_links & BIT(link_id)) &&			\
		    ((link_sta) = link_sta_dereference_check(sta, link_id)))

/**
 * enum sta_notify_cmd - sta notify command
 *
 * Used with the sta_notify() callback in &struct ieee80211_ops, this
 * indicates if an associated station made a power state transition.
*
 * @STA_NOTIFY_SLEEP: a station is now sleeping
 * @STA_NOTIFY_AWAKE: a sleeping station woke up
 */
enum sta_notify_cmd {
	STA_NOTIFY_SLEEP,
	STA_NOTIFY_AWAKE,
};

/**
 * struct ieee80211_tx_control - TX control data
 *
 * @sta: station table entry, this sta pointer may be NULL and
 * 	it is not allowed to copy the pointer, due to RCU.
 */
struct ieee80211_tx_control {
	/* may be NULL; must not be copied (see kernel-doc above) */
	struct ieee80211_sta *sta;
};

/**
 * struct ieee80211_txq - Software intermediate tx queue
 *
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 * @sta: station table entry, %NULL for per-vif queue
 * @tid: the TID for this queue (unused for per-vif queue),
 *	%IEEE80211_NUM_TIDS for non-data (if enabled)
 * @ac: the AC for this queue
 * @drv_priv: driver private area, sized by hw->txq_data_size
 *
 * The driver can obtain packets from this queue by calling
 * ieee80211_tx_dequeue().
 */
struct ieee80211_txq {
	struct ieee80211_vif *vif;
	struct ieee80211_sta *sta;
	u8 tid;
	u8 ac;

	/* must be last */
	u8 drv_priv[] __aligned(sizeof(void *));
};

/**
 * enum ieee80211_hw_flags - hardware flags
 *
 * These flags are used to indicate hardware capabilities to
 * the stack. Generally, flags here should have their meaning
 * done in a way that the simplest hardware doesn't need setting
 * any particular flags. There are some exceptions to this rule,
 * however, so you are advised to review these flags carefully.
 *
 * @IEEE80211_HW_HAS_RATE_CONTROL:
 *	The hardware or firmware includes rate control, and cannot be
 *	controlled by the stack. As such, no rate control algorithm
 *	should be instantiated, and the TX rate reported to userspace
 *	will be taken from the TX status instead of the rate control
 *	algorithm.
* Note that this requires that the driver implement a number of * callbacks so it has the correct information, it needs to have * the @set_rts_threshold callback and must look at the BSS config * @use_cts_prot for G/N protection, @use_short_slot for slot * timing in 2.4 GHz and @use_short_preamble for preambles for * CCK frames. * * @IEEE80211_HW_RX_INCLUDES_FCS: * Indicates that received frames passed to the stack include * the FCS at the end. * * @IEEE80211_HW_HOST_BROADCAST_PS_BUFFERING: * Some wireless LAN chipsets buffer broadcast/multicast frames * for power saving stations in the hardware/firmware and others * rely on the host system for such buffering. This option is used * to configure the IEEE 802.11 upper layer to buffer broadcast and * multicast frames when there are power saving stations so that * the driver can fetch them with ieee80211_get_buffered_bc(). * * @IEEE80211_HW_SIGNAL_UNSPEC: * Hardware can provide signal values but we don't know its units. We * expect values between 0 and @max_signal. * If possible please provide dB or dBm instead. * * @IEEE80211_HW_SIGNAL_DBM: * Hardware gives signal values in dBm, decibel difference from * one milliwatt. This is the preferred method since it is standardized * between different devices. @max_signal does not need to be set. * * @IEEE80211_HW_SPECTRUM_MGMT: * Hardware supports spectrum management defined in 802.11h * Measurement, Channel Switch, Quieting, TPC * * @IEEE80211_HW_AMPDU_AGGREGATION: * Hardware supports 11n A-MPDU aggregation. * * @IEEE80211_HW_SUPPORTS_PS: * Hardware has power save support (i.e. can go to sleep). * * @IEEE80211_HW_PS_NULLFUNC_STACK: * Hardware requires nullfunc frame handling in stack, implies * stack support for dynamic PS. * * @IEEE80211_HW_SUPPORTS_DYNAMIC_PS: * Hardware has support for dynamic PS. * * @IEEE80211_HW_MFP_CAPABLE: * Hardware supports management frame protection (MFP, IEEE 802.11w). 
* * @IEEE80211_HW_REPORTS_TX_ACK_STATUS: * Hardware can provide ack status reports of Tx frames to * the stack. * * @IEEE80211_HW_CONNECTION_MONITOR: * The hardware performs its own connection monitoring, including * periodic keep-alives to the AP and probing the AP on beacon loss. * * @IEEE80211_HW_NEED_DTIM_BEFORE_ASSOC: * This device needs to get data from beacon before association (i.e. * dtim_period). * * @IEEE80211_HW_SUPPORTS_PER_STA_GTK: The device's crypto engine supports * per-station GTKs as used by IBSS RSN or during fast transition. If * the device doesn't support per-station GTKs, but can be asked not * to decrypt group addressed frames, then IBSS RSN support is still * possible but software crypto will be used. Advertise the wiphy flag * only in that case. * * @IEEE80211_HW_AP_LINK_PS: When operating in AP mode the device * autonomously manages the PS status of connected stations. When * this flag is set mac80211 will not trigger PS mode for connected * stations based on the PM bit of incoming frames. * Use ieee80211_sta_ps_transition() to manually configure * the PS mode of connected stations. * * @IEEE80211_HW_TX_AMPDU_SETUP_IN_HW: The device handles TX A-MPDU session * setup strictly in HW. mac80211 should not attempt to do this in * software. * * @IEEE80211_HW_WANT_MONITOR_VIF: The driver would like to be informed of * a virtual monitor interface when monitor interfaces are the only * active interfaces. * * @IEEE80211_HW_NO_AUTO_VIF: The driver would like for no wlanX to * be created. It is expected user-space will create vifs as * desired (and thus have them named as desired). * * @IEEE80211_HW_SW_CRYPTO_CONTROL: The driver wants to control which of the * crypto algorithms can be done in software - so don't automatically * try to fall back to it if hardware crypto fails, but do so only if * the driver returns 1. This also forces the driver to advertise its * supported cipher suites. 
* * @IEEE80211_HW_SUPPORT_FAST_XMIT: The driver/hardware supports fast-xmit, * this currently requires only the ability to calculate the duration * for frames. * * @IEEE80211_HW_QUEUE_CONTROL: The driver wants to control per-interface * queue mapping in order to use different queues (not just one per AC) * for different virtual interfaces. See the doc section on HW queue * control for more details. * * @IEEE80211_HW_SUPPORTS_RC_TABLE: The driver supports using a rate * selection table provided by the rate control algorithm. * * @IEEE80211_HW_P2P_DEV_ADDR_FOR_INTF: Use the P2P Device address for any * P2P Interface. This will be honoured even if more than one interface * is supported. * * @IEEE80211_HW_TIMING_BEACON_ONLY: Use sync timing from beacon frames * only, to allow getting TBTT of a DTIM beacon. * * @IEEE80211_HW_SUPPORTS_HT_CCK_RATES: Hardware supports mixing HT/CCK rates * and can cope with CCK rates in an aggregation session (e.g. by not * using aggregation for such frames.) * * @IEEE80211_HW_CHANCTX_STA_CSA: Support 802.11h based channel-switch (CSA) * for a single active channel while using channel contexts. When support * is not enabled the default action is to disconnect when getting the * CSA frame. * * @IEEE80211_HW_SUPPORTS_CLONED_SKBS: The driver will never modify the payload * or tailroom of TX skbs without copying them first. * * @IEEE80211_HW_SINGLE_SCAN_ON_ALL_BANDS: The HW supports scanning on all bands * in one command, mac80211 doesn't have to run separate scans per band. * * @IEEE80211_HW_TDLS_WIDER_BW: The device/driver supports wider bandwidth * than the BSS bandwidth for a TDLS link on the base channel. * * @IEEE80211_HW_SUPPORTS_AMSDU_IN_AMPDU: The driver supports receiving A-MSDUs * within A-MPDU. * * @IEEE80211_HW_BEACON_TX_STATUS: The device/driver provides TX status * for sent beacons. * * @IEEE80211_HW_NEEDS_UNIQUE_STA_ADDR: Hardware (or driver) requires that each * station has a unique address, i.e. 
each station entry can be identified * by just its MAC address; this prevents, for example, the same station * from connecting to two virtual AP interfaces at the same time. * * @IEEE80211_HW_SUPPORTS_REORDERING_BUFFER: Hardware (or driver) manages the * reordering buffer internally, guaranteeing mac80211 receives frames in * order and does not need to manage its own reorder buffer or BA session * timeout. * * @IEEE80211_HW_USES_RSS: The device uses RSS and thus requires parallel RX, * which implies using per-CPU station statistics. * * @IEEE80211_HW_TX_AMSDU: Hardware (or driver) supports software aggregated * A-MSDU frames. Requires software tx queueing and fast-xmit support. * When not using minstrel/minstrel_ht rate control, the driver must * limit the maximum A-MSDU size based on the current tx rate by setting * max_rc_amsdu_len in struct ieee80211_sta. * * @IEEE80211_HW_TX_FRAG_LIST: Hardware (or driver) supports sending frag_list * skbs, needed for zero-copy software A-MSDU. * * @IEEE80211_HW_REPORTS_LOW_ACK: The driver (or firmware) reports low ack event * by ieee80211_report_low_ack() based on its own algorithm. For such * drivers, mac80211 packet loss mechanism will not be triggered and driver * is completely depending on firmware event for station kickout. * * @IEEE80211_HW_SUPPORTS_TX_FRAG: Hardware does fragmentation by itself. * The stack will not do fragmentation. * The callback for @set_frag_threshold should be set as well. * * @IEEE80211_HW_SUPPORTS_TDLS_BUFFER_STA: Hardware supports buffer STA on * TDLS links. * * @IEEE80211_HW_DEAUTH_NEED_MGD_TX_PREP: The driver requires the * mgd_prepare_tx() callback to be called before transmission of a * deauthentication frame in case the association was completed but no * beacon was heard. 
This is required in multi-channel scenarios, where the * virtual interface might not be given air time for the transmission of * the frame, as it is not synced with the AP/P2P GO yet, and thus the * deauthentication frame might not be transmitted. * * @IEEE80211_HW_DOESNT_SUPPORT_QOS_NDP: The driver (or firmware) doesn't * support QoS NDP for AP probing - that's most likely a driver bug. * * @IEEE80211_HW_BUFF_MMPDU_TXQ: use the TXQ for bufferable MMPDUs, this of * course requires the driver to use TXQs to start with. * * @IEEE80211_HW_SUPPORTS_VHT_EXT_NSS_BW: (Hardware) rate control supports VHT * extended NSS BW (dot11VHTExtendedNSSBWCapable). This flag will be set if * the selected rate control algorithm sets %RATE_CTRL_CAPA_VHT_EXT_NSS_BW * but if the rate control is built-in then it must be set by the driver. * See also the documentation for that flag. * * @IEEE80211_HW_STA_MMPDU_TXQ: use the extra non-TID per-station TXQ for all * MMPDUs on station interfaces. This of course requires the driver to use * TXQs to start with. * * @IEEE80211_HW_TX_STATUS_NO_AMPDU_LEN: Driver does not report accurate A-MPDU * length in tx status information * * @IEEE80211_HW_SUPPORTS_MULTI_BSSID: Hardware supports multi BSSID * * @IEEE80211_HW_SUPPORTS_ONLY_HE_MULTI_BSSID: Hardware supports multi BSSID * only for HE APs. Applies if @IEEE80211_HW_SUPPORTS_MULTI_BSSID is set. * * @IEEE80211_HW_AMPDU_KEYBORDER_SUPPORT: The card and driver is only * aggregating MPDUs with the same keyid, allowing mac80211 to keep Tx * A-MPDU sessions active while rekeying with Extended Key ID. * * @IEEE80211_HW_SUPPORTS_TX_ENCAP_OFFLOAD: Hardware supports tx encapsulation * offload * * @IEEE80211_HW_SUPPORTS_RX_DECAP_OFFLOAD: Hardware supports rx decapsulation * offload * * @IEEE80211_HW_SUPPORTS_CONC_MON_RX_DECAP: Hardware supports concurrent rx * decapsulation offload and passing raw 802.11 frames for monitor iface. 
* If this is supported, the driver must pass both 802.3 frames for real * usage and 802.11 frames with %RX_FLAG_ONLY_MONITOR set for monitor to * the stack. * * @IEEE80211_HW_DETECTS_COLOR_COLLISION: HW/driver has support for BSS color * collision detection and doesn't need it in software. * * @IEEE80211_HW_MLO_MCAST_MULTI_LINK_TX: Hardware/driver handles transmitting * multicast frames on all links, mac80211 should not do that. * * @IEEE80211_HW_DISALLOW_PUNCTURING: HW requires disabling puncturing in EHT * and connecting with a lower bandwidth instead * * @IEEE80211_HW_HANDLES_QUIET_CSA: HW/driver handles quieting for CSA, so * no need to stop queues. This really should be set by a driver that * implements MLO, so operation can continue on other links when one * link is switching. * * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays */ enum ieee80211_hw_flags { /* each value is a bit number in the flags bitmap of struct ieee80211_hw */ IEEE80211_HW_HAS_RATE_CONTROL, IEEE80211_HW_RX_INCLUDES_FCS, IEEE80211_HW_HOST_BROADCAST_PS_BUFFERING, IEEE80211_HW_SIGNAL_UNSPEC, IEEE80211_HW_SIGNAL_DBM, IEEE80211_HW_NEED_DTIM_BEFORE_ASSOC, IEEE80211_HW_SPECTRUM_MGMT, IEEE80211_HW_AMPDU_AGGREGATION, IEEE80211_HW_SUPPORTS_PS, IEEE80211_HW_PS_NULLFUNC_STACK, IEEE80211_HW_SUPPORTS_DYNAMIC_PS, IEEE80211_HW_MFP_CAPABLE, IEEE80211_HW_WANT_MONITOR_VIF, IEEE80211_HW_NO_AUTO_VIF, IEEE80211_HW_SW_CRYPTO_CONTROL, IEEE80211_HW_SUPPORT_FAST_XMIT, IEEE80211_HW_REPORTS_TX_ACK_STATUS, IEEE80211_HW_CONNECTION_MONITOR, IEEE80211_HW_QUEUE_CONTROL, IEEE80211_HW_SUPPORTS_PER_STA_GTK, IEEE80211_HW_AP_LINK_PS, IEEE80211_HW_TX_AMPDU_SETUP_IN_HW, IEEE80211_HW_SUPPORTS_RC_TABLE, IEEE80211_HW_P2P_DEV_ADDR_FOR_INTF, IEEE80211_HW_TIMING_BEACON_ONLY, IEEE80211_HW_SUPPORTS_HT_CCK_RATES, IEEE80211_HW_CHANCTX_STA_CSA, IEEE80211_HW_SUPPORTS_CLONED_SKBS, IEEE80211_HW_SINGLE_SCAN_ON_ALL_BANDS, IEEE80211_HW_TDLS_WIDER_BW, IEEE80211_HW_SUPPORTS_AMSDU_IN_AMPDU, IEEE80211_HW_BEACON_TX_STATUS, IEEE80211_HW_NEEDS_UNIQUE_STA_ADDR, IEEE80211_HW_SUPPORTS_REORDERING_BUFFER, 
IEEE80211_HW_USES_RSS, IEEE80211_HW_TX_AMSDU, IEEE80211_HW_TX_FRAG_LIST, IEEE80211_HW_REPORTS_LOW_ACK, IEEE80211_HW_SUPPORTS_TX_FRAG, IEEE80211_HW_SUPPORTS_TDLS_BUFFER_STA, IEEE80211_HW_DEAUTH_NEED_MGD_TX_PREP, IEEE80211_HW_DOESNT_SUPPORT_QOS_NDP, IEEE80211_HW_BUFF_MMPDU_TXQ, IEEE80211_HW_SUPPORTS_VHT_EXT_NSS_BW, IEEE80211_HW_STA_MMPDU_TXQ, IEEE80211_HW_TX_STATUS_NO_AMPDU_LEN, IEEE80211_HW_SUPPORTS_MULTI_BSSID, IEEE80211_HW_SUPPORTS_ONLY_HE_MULTI_BSSID, IEEE80211_HW_AMPDU_KEYBORDER_SUPPORT, IEEE80211_HW_SUPPORTS_TX_ENCAP_OFFLOAD, IEEE80211_HW_SUPPORTS_RX_DECAP_OFFLOAD, IEEE80211_HW_SUPPORTS_CONC_MON_RX_DECAP, IEEE80211_HW_DETECTS_COLOR_COLLISION, IEEE80211_HW_MLO_MCAST_MULTI_LINK_TX, IEEE80211_HW_DISALLOW_PUNCTURING, IEEE80211_HW_HANDLES_QUIET_CSA, /* keep last: as the final enumerator it equals the number of flags above */ NUM_IEEE80211_HW_FLAGS }; /** * struct ieee80211_hw - hardware information and state * * This structure contains the configuration and hardware * information for an 802.11 PHY. * * @wiphy: This points to the &struct wiphy allocated for this * 802.11 PHY. You must fill in the @perm_addr and @dev * members of this structure using SET_IEEE80211_DEV() * and SET_IEEE80211_PERM_ADDR(). Additionally, all supported * bands (with channels, bitrates) are registered here. * * @conf: &struct ieee80211_conf, device configuration, don't use. * * @priv: pointer to private area that was allocated for driver use * along with this structure. * * @flags: hardware flags, see &enum ieee80211_hw_flags. * * @extra_tx_headroom: headroom to reserve in each transmit skb * for use by the driver (e.g. for transmit headers.) * * @extra_beacon_tailroom: tailroom to reserve in each beacon tx skb. * Can be used by drivers to add extra IEs. 
* * @max_signal: Maximum value for signal (rssi) in RX information, used * only when @IEEE80211_HW_SIGNAL_UNSPEC is set (not needed with * @IEEE80211_HW_SIGNAL_DBM). * * @max_listen_interval: max listen interval in units of beacon interval * that HW supports * * @queues: number of available hardware transmit queues for * data packets. WMM/QoS requires at least four, these * queues need to have configurable access parameters. * * @rate_control_algorithm: rate control algorithm for this hardware. * If unset (NULL), the default algorithm will be used. Must be * set before calling ieee80211_register_hw(). * * @vif_data_size: size (in bytes) of the drv_priv data area * within &struct ieee80211_vif. * @sta_data_size: size (in bytes) of the drv_priv data area * within &struct ieee80211_sta. * @chanctx_data_size: size (in bytes) of the drv_priv data area * within &struct ieee80211_chanctx_conf. * @txq_data_size: size (in bytes) of the drv_priv data area * within &struct ieee80211_txq. * * @max_rates: maximum number of alternate rate retry stages the hw * can handle. * @max_report_rates: maximum number of alternate rate retry stages * the hw can report back. * @max_rate_tries: maximum number of tries for each stage * * @max_rx_aggregation_subframes: maximum buffer size (number of * sub-frames) to be used for A-MPDU block ack receiver * aggregation. * This is only relevant if the device has restrictions on the * number of subframes, if it relies on mac80211 to do reordering * it shouldn't be set. * * @max_tx_aggregation_subframes: maximum number of subframes in an * aggregate an HT/HE device will transmit. In HT AddBA we'll * advertise a constant value of 64 as some older APs crash if * the window size is smaller (an example is LinkSys WRT120N * with FW v1.0.07 build 002 Jun 18 2012). * For AddBA to HE capable peers this value will be used. * * @max_tx_fragments: maximum number of tx buffers per (A)-MSDU, sum * of 1 + skb_shinfo(skb)->nr_frags for each skb in the frag_list. 
* * @offchannel_tx_hw_queue: HW queue ID to use for offchannel TX * (if %IEEE80211_HW_QUEUE_CONTROL is set) * * @radiotap_mcs_details: lists which MCS information the HW * reports, by default it is set to _MCS, _GI and _BW but doesn't * include _FMT. Use %IEEE80211_RADIOTAP_MCS_HAVE_\* values, only * adding _BW is supported today. * * @radiotap_vht_details: lists which VHT MCS information the HW reports, * the default is _GI | _BANDWIDTH. * Use the %IEEE80211_RADIOTAP_VHT_KNOWN_\* values. * * @radiotap_timestamp: Information for the radiotap timestamp field; if the * @units_pos member is set to a non-negative value then the timestamp * field will be added and populated from the &struct ieee80211_rx_status * device_timestamp. * @radiotap_timestamp.units_pos: Must be set to a combination of a * IEEE80211_RADIOTAP_TIMESTAMP_UNIT_* and a * IEEE80211_RADIOTAP_TIMESTAMP_SPOS_* value. * @radiotap_timestamp.accuracy: If non-negative, fills the accuracy in the * radiotap field and the accuracy known flag will be set. * * @netdev_features: netdev features to be set in each netdev created * from this HW. Note that not all features are usable with mac80211, * other features will be rejected during HW registration. * * @uapsd_queues: This bitmap is included in (re)association frame to indicate * for each access category if it is uAPSD trigger-enabled and delivery- * enabled. Use IEEE80211_WMM_IE_STA_QOSINFO_AC_* to set this bitmap. * Each bit corresponds to different AC. Value '1' in specific bit means * that corresponding AC is both trigger- and delivery-enabled. '0' means * neither enabled. * * @uapsd_max_sp_len: maximum number of total buffered frames the WMM AP may * deliver to a WMM STA during any Service Period triggered by the WMM STA. * Use IEEE80211_WMM_IE_STA_QOSINFO_SP_* for correct values. * * @max_nan_de_entries: maximum number of NAN DE functions supported by the * device. 
* * @tx_sk_pacing_shift: Pacing shift to set on TCP sockets when frames from * them are encountered. The default should typically not be changed, * unless the driver has good reasons for needing more buffers. * * @weight_multiplier: Driver specific airtime weight multiplier used while * refilling deficit of each TXQ. * * @max_mtu: the max mtu could be set. * * @tx_power_levels: a list of power levels supported by the wifi hardware. * The power levels can be specified either as integer or fractions. * The power level at idx 0 shall be the maximum positive power level. * * @max_txpwr_levels_idx: the maximum valid idx of 'tx_power_levels' list. */ struct ieee80211_hw { struct ieee80211_conf conf; struct wiphy *wiphy; const char *rate_control_algorithm; void *priv; unsigned long flags[BITS_TO_LONGS(NUM_IEEE80211_HW_FLAGS)]; unsigned int extra_tx_headroom; unsigned int extra_beacon_tailroom; int vif_data_size; int sta_data_size; int chanctx_data_size; int txq_data_size; u16 queues; u16 max_listen_interval; s8 max_signal; u8 max_rates; u8 max_report_rates; u8 max_rate_tries; u16 max_rx_aggregation_subframes; u16 max_tx_aggregation_subframes; u8 max_tx_fragments; u8 offchannel_tx_hw_queue; u8 radiotap_mcs_details; u16 radiotap_vht_details; struct { int units_pos; s16 accuracy; } radiotap_timestamp; netdev_features_t netdev_features; u8 uapsd_queues; u8 uapsd_max_sp_len; u8 max_nan_de_entries; u8 tx_sk_pacing_shift; u8 weight_multiplier; u32 max_mtu; const s8 *tx_power_levels; u8 max_txpwr_levels_idx; }; static inline bool _ieee80211_hw_check(struct ieee80211_hw *hw, enum ieee80211_hw_flags flg) { return test_bit(flg, hw->flags); } #define ieee80211_hw_check(hw, flg) _ieee80211_hw_check(hw, IEEE80211_HW_##flg) static inline void _ieee80211_hw_set(struct ieee80211_hw *hw, enum ieee80211_hw_flags flg) { return __set_bit(flg, hw->flags); } #define ieee80211_hw_set(hw, flg) _ieee80211_hw_set(hw, IEEE80211_HW_##flg) /** * struct ieee80211_scan_request - hw scan request * * 
@ies: pointers to different parts of IEs (in req.ie) * @req: cfg80211 request. */ struct ieee80211_scan_request { struct ieee80211_scan_ies ies; /* Keep last */ struct cfg80211_scan_request req; }; /** * struct ieee80211_tdls_ch_sw_params - TDLS channel switch parameters * * @sta: peer this TDLS channel-switch request/response came from * @chandef: channel referenced in a TDLS channel-switch request * @action_code: see &enum ieee80211_tdls_actioncode * @status: channel-switch response status * @timestamp: time at which the frame was received * @switch_time: switch-timing parameter received in the frame * @switch_timeout: switch-timing parameter received in the frame * @tmpl_skb: TDLS switch-channel response template * @ch_sw_tm_ie: offset of the channel-switch timing IE inside @tmpl_skb */ struct ieee80211_tdls_ch_sw_params { struct ieee80211_sta *sta; struct cfg80211_chan_def *chandef; u8 action_code; u32 status; u32 timestamp; u16 switch_time; u16 switch_timeout; struct sk_buff *tmpl_skb; u32 ch_sw_tm_ie; }; /** * wiphy_to_ieee80211_hw - return a mac80211 driver hw struct from a wiphy * * @wiphy: the &struct wiphy which we want to query * * mac80211 drivers can use this to get to their respective * &struct ieee80211_hw. Drivers wishing to get to their own private * structure can then access it via hw->priv. Note that mac80211 drivers should * not use wiphy_priv() to try to get their private driver structure as this * is already used internally by mac80211. * * Return: The mac80211 driver hw struct of @wiphy. 
*/ struct ieee80211_hw *wiphy_to_ieee80211_hw(struct wiphy *wiphy); /** * SET_IEEE80211_DEV - set device for 802.11 hardware * * @hw: the &struct ieee80211_hw to set the device for * @dev: the &struct device of this 802.11 device */ static inline void SET_IEEE80211_DEV(struct ieee80211_hw *hw, struct device *dev) { set_wiphy_dev(hw->wiphy, dev); } /** * SET_IEEE80211_PERM_ADDR - set the permanent MAC address for 802.11 hardware * * @hw: the &struct ieee80211_hw to set the MAC address for * @addr: the address to set */ static inline void SET_IEEE80211_PERM_ADDR(struct ieee80211_hw *hw, const u8 *addr) { memcpy(hw->wiphy->perm_addr, addr, ETH_ALEN); } static inline struct ieee80211_rate * ieee80211_get_tx_rate(const struct ieee80211_hw *hw, const struct ieee80211_tx_info *c) { if (WARN_ON_ONCE(c->control.rates[0].idx < 0)) return NULL; return &hw->wiphy->bands[c->band]->bitrates[c->control.rates[0].idx]; } static inline struct ieee80211_rate * ieee80211_get_rts_cts_rate(const struct ieee80211_hw *hw, const struct ieee80211_tx_info *c) { if (c->control.rts_cts_rate_idx < 0) return NULL; return &hw->wiphy->bands[c->band]->bitrates[c->control.rts_cts_rate_idx]; } static inline struct ieee80211_rate * ieee80211_get_alt_retry_rate(const struct ieee80211_hw *hw, const struct ieee80211_tx_info *c, int idx) { if (c->control.rates[idx + 1].idx < 0) return NULL; return &hw->wiphy->bands[c->band]->bitrates[c->control.rates[idx + 1].idx]; } /** * ieee80211_free_txskb - free TX skb * @hw: the hardware * @skb: the skb * * Free a transmit skb. Use this function when some failure * to transmit happened and thus status cannot be reported. */ void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb); /** * DOC: Hardware crypto acceleration * * mac80211 is capable of taking advantage of many hardware * acceleration designs for encryption and decryption operations. 
* * The set_key() callback in the &struct ieee80211_ops for a given * device is called to enable hardware acceleration of encryption and * decryption. The callback takes a @sta parameter that will be NULL * for default keys or keys used for transmission only, or point to * the station information for the peer for individual keys. * Multiple transmission keys with the same key index may be used when * VLANs are configured for an access point. * * When transmitting, the TX control data will use the @hw_key_idx * selected by the driver by modifying the &struct ieee80211_key_conf * pointed to by the @key parameter to the set_key() function. * * The set_key() call for the %SET_KEY command should return 0 if * the key is now in use, -%EOPNOTSUPP or -%ENOSPC if it couldn't be * added; if you return 0 then hw_key_idx must be assigned to the * hardware key index. You are free to use the full u8 range. * * Note that in the case that the @IEEE80211_HW_SW_CRYPTO_CONTROL flag is * set, mac80211 will not automatically fall back to software crypto if * enabling hardware crypto failed. The set_key() call may also return the * value 1 to permit this specific key/algorithm to be done in software. * * When the cmd is %DISABLE_KEY then it must succeed. * * Note that it is permissible to not decrypt a frame even if a key * for it has been uploaded to hardware. The stack will not make any * decision based on whether a key has been uploaded or not but rather * based on the receive flags. * * The &struct ieee80211_key_conf structure pointed to by the @key * parameter is guaranteed to be valid until another call to set_key() * removes it, but it can only be used as a cookie to differentiate * keys. * * In TKIP some HW need to be provided a phase 1 key, for RX decryption * acceleration (i.e. iwlwifi). Those drivers should provide update_tkip_key * handler. * The update_tkip_key() call updates the driver with the new phase 1 key. 
* This happens every time the iv16 wraps around (every 65536 packets). The * set_key() call will happen only once for each key (unless the AP did * rekeying); it will not include a valid phase 1 key. The valid phase 1 key is * provided by update_tkip_key only. The trigger that makes mac80211 call this * handler is software decryption with wrap around of iv16. * * The set_default_unicast_key() call updates the default WEP key index * configured to the hardware for WEP encryption type. This is required * for devices that support offload of data packets (e.g. ARP responses). * * Mac80211 drivers should set the @NL80211_EXT_FEATURE_CAN_REPLACE_PTK0 flag * when they are able to replace in-use PTK keys according to the following * requirements: * 1) They do not hand over frames decrypted with the old key to mac80211 once the call to set_key() with command %DISABLE_KEY has been completed, 2) either drop or continue to use the old key for any outgoing frames queued at the time of the key deletion (including re-transmits), 3) never send out a frame queued prior to the set_key() %SET_KEY command encrypted with the new key when also needing @IEEE80211_KEY_FLAG_GENERATE_IV and 4) never send out a frame unencrypted when it should be encrypted. Mac80211 will not queue any new frames for a deleted key to the driver. */ /** * DOC: Powersave support * * mac80211 has support for various powersave implementations. * * First, it can support hardware that handles all powersaving by itself; * such hardware should simply set the %IEEE80211_HW_SUPPORTS_PS hardware * flag. In that case, it will be told about the desired powersave mode * with the %IEEE80211_CONF_PS flag depending on the association status. * The hardware must take care of sending nullfunc frames when necessary, * i.e. when entering and leaving powersave mode. The hardware is required * to look at the AID in beacons and signal to the AP that it woke up when * it finds traffic directed to it. 
* * %IEEE80211_CONF_PS flag enabled means that the powersave mode defined in * IEEE 802.11-2007 section 11.2 is enabled. This is not to be confused * with hardware wakeup and sleep states. Driver is responsible for waking * up the hardware before issuing commands to the hardware and putting it * back to sleep at appropriate times. * * When PS is enabled, hardware needs to wake up for beacons and receive the * buffered multicast/broadcast frames after the beacon. Also it must be * possible to send frames and receive the acknowledgment frame. * * Other hardware designs cannot send nullfunc frames by themselves and also * need software support for parsing the TIM bitmap. This is also supported * by mac80211 by combining the %IEEE80211_HW_SUPPORTS_PS and * %IEEE80211_HW_PS_NULLFUNC_STACK flags. The hardware is of course still * required to pass up beacons. The hardware is still required to handle * waking up for multicast traffic; if it cannot the driver must handle that * as best as it can; mac80211 is too slow to do that. * * Dynamic powersave is an extension to normal powersave in which the * hardware stays awake for a user-specified period of time after sending a * frame so that reply frames need not be buffered and therefore delayed to * the next wakeup. It's a compromise of getting good enough latency when * there's data traffic and still saving significant power in idle * periods. * * Dynamic powersave is simply supported by mac80211 enabling and disabling * PS based on traffic. Driver needs to only set %IEEE80211_HW_SUPPORTS_PS * flag and mac80211 will handle everything automatically. Additionally, * hardware having support for the dynamic PS feature may set the * %IEEE80211_HW_SUPPORTS_DYNAMIC_PS flag to indicate that it can support * dynamic PS mode itself. The driver needs to look at the * @dynamic_ps_timeout hardware configuration value and use that value * whenever %IEEE80211_CONF_PS is set. 
In this case mac80211 will disable * dynamic PS feature in stack and will just keep %IEEE80211_CONF_PS * enabled whenever user has enabled powersave. * * Driver informs U-APSD client support by enabling * %IEEE80211_VIF_SUPPORTS_UAPSD flag. The mode is configured through the * uapsd parameter in conf_tx() operation. Hardware needs to send the QoS * Nullfunc frames and stay awake until the service period has ended. To * utilize U-APSD, dynamic powersave is disabled for voip AC and all frames * from that AC are transmitted with powersave enabled. * * Note: U-APSD client mode is not yet supported with * %IEEE80211_HW_PS_NULLFUNC_STACK. */ /** * DOC: Beacon filter support * * Some hardware have beacon filter support to reduce host cpu wakeups * which will reduce system power consumption. It usually works so that * the firmware creates a checksum of the beacon but omits all constantly * changing elements (TSF, TIM etc). Whenever the checksum changes the * beacon is forwarded to the host, otherwise it will be just dropped. That * way the host will only receive beacons where some relevant information * (for example ERP protection or WMM settings) have changed. * * Beacon filter support is advertised with the %IEEE80211_VIF_BEACON_FILTER * interface capability. The driver needs to enable beacon filter support * whenever power save is enabled, that is %IEEE80211_CONF_PS is set. When * power save is enabled, the stack will not check for beacon loss and the * driver needs to notify about loss of beacons with ieee80211_beacon_loss(). * * The time (or number of beacons missed) until the firmware notifies the * driver of a beacon loss event (which in turn causes the driver to call * ieee80211_beacon_loss()) should be configurable and will be controlled * by mac80211 and the roaming algorithm in the future. 
* * Since there may be constantly changing information elements that nothing * in the software stack cares about, we will, in the future, have mac80211 * tell the driver which information elements are interesting in the sense * that we want to see changes in them. This will include * * - a list of information element IDs * - a list of OUIs for the vendor information element * * Ideally, the hardware would filter out any beacons without changes in the * requested elements, but if it cannot support that it may, at the expense * of some efficiency, filter out only a subset. For example, if the device * doesn't support checking for OUIs it should pass up all changes in all * vendor information elements. * * Note that change, for the sake of simplification, also includes information * elements appearing or disappearing from the beacon. * * Some hardware supports an "ignore list" instead. Just make sure nothing * that was requested is on the ignore list, and include commonly changing * information element IDs in the ignore list, for example 11 (BSS load) and * the various vendor-assigned IEs with unknown contents (128, 129, 133-136, * 149, 150, 155, 156, 173, 176, 178, 179, 219); for forward compatibility * it could also include some currently unused IDs. * * * In addition to these capabilities, hardware should support notifying the * host of changes in the beacon RSSI. This is relevant to implement roaming * when no traffic is flowing (when traffic is flowing we see the RSSI of * the received data packets). This can consist of notifying the host when * the RSSI changes significantly or when it drops below or rises above * configurable thresholds. In the future these thresholds will also be * configured by mac80211 (which gets them from userspace) to implement * them as the roaming algorithm requires. 
* * If the hardware cannot implement this, the driver should ask it to * periodically pass beacon frames to the host so that software can do the * signal strength threshold checking. */ /** * DOC: Spatial multiplexing power save * * SMPS (Spatial multiplexing power save) is a mechanism to conserve * power in an 802.11n implementation. For details on the mechanism * and rationale, please refer to 802.11 (as amended by 802.11n-2009) * "11.2.3 SM power save". * * The mac80211 implementation is capable of sending action frames * to update the AP about the station's SMPS mode, and will instruct * the driver to enter the specific mode. It will also announce the * requested SMPS mode during the association handshake. Hardware * support for this feature is required, and can be indicated by * hardware flags. * * The default mode will be "automatic", which nl80211/cfg80211 * defines to be dynamic SMPS in (regular) powersave, and SMPS * turned off otherwise. * * To support this feature, the driver must set the appropriate * hardware support flags, and handle the SMPS flag to the config() * operation. It will then with this mechanism be instructed to * enter the requested SMPS mode while associated to an HT AP. */ /** * DOC: Frame filtering * * mac80211 requires to see many management frames for proper * operation, and users may want to see many more frames when * in monitor mode. However, for best CPU usage and power consumption, * having as few frames as possible percolate through the stack is * desirable. Hence, the hardware should filter as much as possible. * * To achieve this, mac80211 uses filter flags (see below) to tell * the driver's configure_filter() function which frames should be * passed to mac80211 and which should be filtered out. * * Before configure_filter() is invoked, the prepare_multicast() * callback is invoked with the parameters @mc_count and @mc_list * for the combined multicast address list of all virtual interfaces. 
* Its use is optional, and it returns a u64 that is passed to * configure_filter(). Additionally, configure_filter() has the * arguments @changed_flags telling which flags were changed and * @total_flags with the new flag states. * * If your device has no multicast address filters your driver will * need to check both the %FIF_ALLMULTI flag and the @mc_count * parameter to see whether multicast frames should be accepted * or dropped. * * All unsupported flags in @total_flags must be cleared. * Hardware does not support a flag if it is incapable of _passing_ * the frame to the stack. Otherwise the driver must ignore * the flag, but not clear it. * You must _only_ clear the flag (announce no support for the * flag to mac80211) if you are not able to pass the packet type * to the stack (so the hardware always filters it). * So for example, you should clear @FIF_CONTROL, if your hardware * always filters control frames. If your hardware always passes * control frames to the kernel and is incapable of filtering them, * you do _not_ clear the @FIF_CONTROL flag. * This rule applies to all other FIF flags as well. */ /** * DOC: AP support for powersaving clients * * In order to implement AP and P2P GO modes, mac80211 has support for * client powersaving, both "legacy" PS (PS-Poll/null data) and uAPSD. * There currently is no support for sAPSD. * * There is one assumption that mac80211 makes, namely that a client * will not poll with PS-Poll and trigger with uAPSD at the same time. * Both are supported, and both can be used by the same client, but * they can't be used concurrently by the same client. This simplifies * the driver code. * * The first thing to keep in mind is that there is a flag for complete * driver implementation: %IEEE80211_HW_AP_LINK_PS. If this flag is set, * mac80211 expects the driver to handle most of the state machine for * powersaving clients and will ignore the PM bit in incoming frames.
* Drivers then use ieee80211_sta_ps_transition() to inform mac80211 of * stations' powersave transitions. In this mode, mac80211 also doesn't * handle PS-Poll/uAPSD. * * In the mode without %IEEE80211_HW_AP_LINK_PS, mac80211 will check the * PM bit in incoming frames for client powersave transitions. When a * station goes to sleep, we will stop transmitting to it. There is, * however, a race condition: a station might go to sleep while there is * data buffered on hardware queues. If the device has support for this * it will reject frames, and the driver should give the frames back to * mac80211 with the %IEEE80211_TX_STAT_TX_FILTERED flag set which will * cause mac80211 to retry the frame when the station wakes up. The * driver is also notified of powersave transitions by calling its * @sta_notify callback. * * When the station is asleep, it has three choices: it can wake up, * it can PS-Poll, or it can possibly start a uAPSD service period. * Waking up is implemented by simply transmitting all buffered (and * filtered) frames to the station. This is the easiest case. When * the station sends a PS-Poll or a uAPSD trigger frame, mac80211 * will inform the driver of this with the @allow_buffered_frames * callback; this callback is optional. mac80211 will then transmit * the frames as usual and set the %IEEE80211_TX_CTL_NO_PS_BUFFER * on each frame. The last frame in the service period (or the only * response to a PS-Poll) also has %IEEE80211_TX_STATUS_EOSP set to * indicate that it ends the service period; as this frame must have * TX status report it also sets %IEEE80211_TX_CTL_REQ_TX_STATUS. * When TX status is reported for this frame, the service period is * marked as having ended and a new one can be started by the peer. * * Additionally, non-bufferable MMPDUs can also be transmitted by * mac80211 with the %IEEE80211_TX_CTL_NO_PS_BUFFER set in them.
* * Another race condition can happen on some devices like iwlwifi * when there are frames queued for the station and it wakes up * or polls; the frames that are already queued could end up being * transmitted first instead, causing reordering and/or wrong * processing of the EOSP. The cause is that allowing frames to be * transmitted to a certain station is out-of-band communication to * the device. To allow this problem to be solved, the driver can * call ieee80211_sta_block_awake() if frames are buffered when it * is notified that the station went to sleep. When all these frames * have been filtered (see above), it must call the function again * to indicate that the station is no longer blocked. * * If the driver buffers frames in the driver for aggregation in any * way, it must use the ieee80211_sta_set_buffered() call when it is * notified of the station going to sleep to inform mac80211 of any * TIDs that have frames buffered. Note that when a station wakes up * this information is reset (hence the requirement to call it when * informed of the station going to sleep). Then, when a service * period starts for any reason, @release_buffered_frames is called * with the number of frames to be released and which TIDs they are * to come from. In this case, the driver is responsible for setting * the EOSP (for uAPSD) and MORE_DATA bits in the released frames. * To help, the @more_data parameter is passed to tell the driver if * there is more data on other TIDs -- the TIDs to release frames * from are ignored since mac80211 doesn't know how many frames the * buffers for those TIDs contain. * * If the driver also implements GO mode, where absence periods may * shorten service periods (or abort PS-Poll responses), it must * filter those response frames except in the case of frames that * are buffered in the driver -- those must remain buffered to avoid * reordering.
Because it is possible that no frames are released * in this case, the driver must call ieee80211_sta_eosp() * to indicate to mac80211 that the service period ended anyway. * * Finally, if frames from multiple TIDs are released from mac80211 * but the driver might reorder them, it must clear & set the flags * appropriately (only the last frame may have %IEEE80211_TX_STATUS_EOSP) * and also take care of the EOSP and MORE_DATA bits in the frame. * The driver may also use ieee80211_sta_eosp() in this case. * * Note that if the driver ever buffers frames other than QoS-data * frames, it must take care to never send a non-QoS-data frame as * the last frame in a service period, adding a QoS-nulldata frame * after a non-QoS-data frame if needed. */ /** * DOC: HW queue control * * Before HW queue control was introduced, mac80211 only had a single static * assignment of per-interface AC software queues to hardware queues. This * was problematic for a few reasons: * 1) off-channel transmissions might get stuck behind other frames * 2) multiple virtual interfaces couldn't be handled correctly * 3) after-DTIM frames could get stuck behind other frames * * To solve this, hardware typically uses multiple different queues for all * the different usages, and this needs to be propagated into mac80211 so it * won't have the same problem with the software queues. * * Therefore, mac80211 now offers the %IEEE80211_HW_QUEUE_CONTROL capability * flag that tells it that the driver implements its own queue control. To do * so, the driver will set up the various queues in each &struct ieee80211_vif * and the offchannel queue in &struct ieee80211_hw. In response, mac80211 will * use those queue IDs in the hw_queue field of &struct ieee80211_tx_info and * if necessary will queue the frame on the right software queue that mirrors * the hardware queue. * Additionally, the driver has to then use these HW queue IDs for the queue * management functions (ieee80211_stop_queue() et al.) 
* * The driver is free to set up the queue mappings as needed; multiple virtual * interfaces may map to the same hardware queues if needed. The setup has to * happen during add_interface or change_interface callbacks. For example, a * driver supporting station+station and station+AP modes might decide to have * 10 hardware queues to handle different scenarios: * * 4 AC HW queues for 1st vif: 0, 1, 2, 3 * 4 AC HW queues for 2nd vif: 4, 5, 6, 7 * after-DTIM queue for AP: 8 * off-channel queue: 9 * * It would then set up the hardware like this: * hw.offchannel_tx_hw_queue = 9 * * and the first virtual interface that is added as follows: * vif.hw_queue[IEEE80211_AC_VO] = 0 * vif.hw_queue[IEEE80211_AC_VI] = 1 * vif.hw_queue[IEEE80211_AC_BE] = 2 * vif.hw_queue[IEEE80211_AC_BK] = 3 * vif.cab_queue = 8 // if AP mode, otherwise %IEEE80211_INVAL_HW_QUEUE * and the second virtual interface with 4-7. * * If queue 6 gets full, for example, mac80211 would only stop the second * virtual interface's BE queue since virtual interface queues are per AC. * * Note that the vif.cab_queue value should be set to %IEEE80211_INVAL_HW_QUEUE * whenever the queue is not used (i.e. the interface is not in AP mode) if the * queue could potentially be shared since mac80211 will look at cab_queue when * a queue is stopped/woken even if the interface is not in AP mode. */ /** * enum ieee80211_filter_flags - hardware filter flags * * These flags determine what the filter in hardware should be * programmed to let through and what should not be passed to the * stack. It is always safe to pass more frames than requested, * but this has negative impact on power consumption. * * @FIF_ALLMULTI: pass all multicast frames, this is used if requested * by the user or if the hardware is not capable of filtering by * multicast address. 
*
 * @FIF_FCSFAIL: pass frames with failed FCS (but you need to set the
 *	%RX_FLAG_FAILED_FCS_CRC for them)
 *
 * @FIF_PLCPFAIL: pass frames with failed PLCP CRC (but you need to set
 *	the %RX_FLAG_FAILED_PLCP_CRC for them)
 *
 * @FIF_BCN_PRBRESP_PROMISC: This flag is set during scanning to indicate
 *	to the hardware that it should not filter beacons or probe responses
 *	by BSSID. Filtering them can greatly reduce the amount of processing
 *	mac80211 needs to do and the amount of CPU wakeups, so you should
 *	honour this flag if possible.
 *
 * @FIF_CONTROL: pass control frames (except for PS Poll) addressed to this
 *	station
 *
 * @FIF_OTHER_BSS: pass frames destined to other BSSes
 *
 * @FIF_PSPOLL: pass PS Poll frames
 *
 * @FIF_PROBE_REQ: pass probe request frames
 *
 * @FIF_MCAST_ACTION: pass multicast Action frames
 */
enum ieee80211_filter_flags {
	/* Bit 0 is not assigned in this enum; the flags occupy bits 1-9. */
	FIF_ALLMULTI		= 1<<1,
	FIF_FCSFAIL		= 1<<2,
	FIF_PLCPFAIL		= 1<<3,
	FIF_BCN_PRBRESP_PROMISC	= 1<<4,
	FIF_CONTROL		= 1<<5,
	FIF_OTHER_BSS		= 1<<6,
	FIF_PSPOLL		= 1<<7,
	FIF_PROBE_REQ		= 1<<8,
	FIF_MCAST_ACTION	= 1<<9,
};

/**
 * enum ieee80211_ampdu_mlme_action - A-MPDU actions
 *
 * These flags are used with the ampdu_action() callback in
 * &struct ieee80211_ops to indicate which action is needed.
 *
 * Note that drivers MUST be able to deal with a TX aggregation
 * session being stopped even before they OK'ed starting it by
 * calling ieee80211_start_tx_ba_cb_irqsafe, because the peer
 * might receive the addBA frame and send a delBA right away!
 *
 * @IEEE80211_AMPDU_RX_START: start RX aggregation
 * @IEEE80211_AMPDU_RX_STOP: stop RX aggregation
 * @IEEE80211_AMPDU_TX_START: start TX aggregation, the driver must either
 *	call ieee80211_start_tx_ba_cb_irqsafe() or
 *	call ieee80211_start_tx_ba_cb_irqsafe() with status
 *	%IEEE80211_AMPDU_TX_START_DELAY_ADDBA to delay addba after
 *	ieee80211_start_tx_ba_cb_irqsafe is called, or just return the special
 *	status %IEEE80211_AMPDU_TX_START_IMMEDIATE.
* @IEEE80211_AMPDU_TX_OPERATIONAL: TX aggregation has become operational
 * @IEEE80211_AMPDU_TX_STOP_CONT: stop TX aggregation but continue transmitting
 *	queued packets, now unaggregated. After all packets are transmitted the
 *	driver has to call ieee80211_stop_tx_ba_cb_irqsafe().
 * @IEEE80211_AMPDU_TX_STOP_FLUSH: stop TX aggregation and flush all packets,
 *	called when the station is removed. There's no need or reason to call
 *	ieee80211_stop_tx_ba_cb_irqsafe() in this case as mac80211 assumes the
 *	session is gone and removes the station.
 * @IEEE80211_AMPDU_TX_STOP_FLUSH_CONT: called when TX aggregation is stopped
 *	but the driver hasn't called ieee80211_stop_tx_ba_cb_irqsafe() yet and
 *	now the connection is dropped and the station will be removed. Drivers
 *	should clean up and drop remaining packets when this is called.
 */
enum ieee80211_ampdu_mlme_action {
	/*
	 * Values are implicit and follow declaration order (0..6). Note that
	 * IEEE80211_AMPDU_TX_OPERATIONAL is declared last, although it is
	 * documented right after IEEE80211_AMPDU_TX_START.
	 */
	IEEE80211_AMPDU_RX_START,
	IEEE80211_AMPDU_RX_STOP,
	IEEE80211_AMPDU_TX_START,
	IEEE80211_AMPDU_TX_STOP_CONT,
	IEEE80211_AMPDU_TX_STOP_FLUSH,
	IEEE80211_AMPDU_TX_STOP_FLUSH_CONT,
	IEEE80211_AMPDU_TX_OPERATIONAL,
};

/*
 * Special status values referred to by the @IEEE80211_AMPDU_TX_START
 * description of enum ieee80211_ampdu_mlme_action.
 */
#define IEEE80211_AMPDU_TX_START_IMMEDIATE 1
#define IEEE80211_AMPDU_TX_START_DELAY_ADDBA 2

/**
 * struct ieee80211_ampdu_params - AMPDU action parameters
 *
 * @action: the ampdu action, value from %ieee80211_ampdu_mlme_action.
 * @sta: peer of this AMPDU session
 * @tid: tid of the BA session
 * @ssn: start sequence number of the session. TX/RX_STOP can pass 0. When
 *	action is set to %IEEE80211_AMPDU_RX_START the driver passes back the
 *	actual ssn value used to start the session and writes the value here.
 * @buf_size: reorder buffer size (number of subframes). Valid only when the
 *	action is set to %IEEE80211_AMPDU_RX_START or
 *	%IEEE80211_AMPDU_TX_OPERATIONAL
 * @amsdu: indicates the peer's ability to receive A-MSDU within A-MPDU.
 *	Valid when the action is set to %IEEE80211_AMPDU_TX_OPERATIONAL
 * @timeout: BA session timeout. Valid only when the action is set to
 *	%IEEE80211_AMPDU_RX_START
 */
struct ieee80211_ampdu_params {
	enum ieee80211_ampdu_mlme_action action;
	struct ieee80211_sta *sta;
	u16 tid;
	u16 ssn;
	u16 buf_size;
	bool amsdu;
	u16 timeout;
};

/**
 * enum ieee80211_frame_release_type - frame release reason
 * @IEEE80211_FRAME_RELEASE_PSPOLL: frame released for PS-Poll
 * @IEEE80211_FRAME_RELEASE_UAPSD: frame(s) released due to
 *	frame received on trigger-enabled AC
 */
enum ieee80211_frame_release_type {
	IEEE80211_FRAME_RELEASE_PSPOLL,
	IEEE80211_FRAME_RELEASE_UAPSD,
};

/**
 * enum ieee80211_rate_control_changed - flags to indicate what changed
 *
 * @IEEE80211_RC_BW_CHANGED: The bandwidth that can be used to transmit
 *	to this station changed. The actual bandwidth is in the station
 *	information -- for HT20/40 the IEEE80211_HT_CAP_SUP_WIDTH_20_40
 *	flag changes, for HT and VHT the bandwidth field changes.
 * @IEEE80211_RC_SMPS_CHANGED: The SMPS state of the station changed.
 * @IEEE80211_RC_SUPP_RATES_CHANGED: The supported rate set of this peer
 *	changed (in IBSS mode) due to discovering more information about
 *	the peer.
 * @IEEE80211_RC_NSS_CHANGED: N_SS (number of spatial streams) was changed
 *	by the peer
 */
enum ieee80211_rate_control_changed {
	/* One bit per change kind, so several can be reported together. */
	IEEE80211_RC_BW_CHANGED		= BIT(0),
	IEEE80211_RC_SMPS_CHANGED	= BIT(1),
	IEEE80211_RC_SUPP_RATES_CHANGED	= BIT(2),
	IEEE80211_RC_NSS_CHANGED	= BIT(3),
};

/**
 * enum ieee80211_roc_type - remain on channel type
 *
 * With the support for multi channel contexts and multi channel operations,
 * remain on channel operations might be limited/deferred/aborted by other
 * flows/operations which have higher priority (and vice versa).
 * Specifying the ROC type can be used by devices to prioritize the ROC
 * operations compared to other operations/flows.
 *
 * @IEEE80211_ROC_TYPE_NORMAL: There are no special requirements for this ROC.
 * @IEEE80211_ROC_TYPE_MGMT_TX: The remain on channel request is required
 *	for sending management frames offchannel.
*/
enum ieee80211_roc_type {
	/* The first enumerator is explicitly pinned to 0. */
	IEEE80211_ROC_TYPE_NORMAL = 0,
	IEEE80211_ROC_TYPE_MGMT_TX,
};

/**
 * enum ieee80211_reconfig_type - reconfig type
 *
 * This enum is used by the reconfig_complete() callback to indicate what
 * reconfiguration type was completed.
 *
 * @IEEE80211_RECONFIG_TYPE_RESTART: hw restart type
 *	(also due to resume() callback returning 1)
 * @IEEE80211_RECONFIG_TYPE_SUSPEND: suspend type (regardless
 *	of wowlan configuration)
 */
enum ieee80211_reconfig_type {
	IEEE80211_RECONFIG_TYPE_RESTART,
	IEEE80211_RECONFIG_TYPE_SUSPEND,
};

/**
 * struct ieee80211_prep_tx_info - prepare TX information
 * @duration: if non-zero, hint about the required duration,
 *	only used with the mgd_prepare_tx() method.
 * @subtype: frame subtype (auth, (re)assoc, deauth, disassoc)
 * @success: whether the frame exchange was successful, only
 *	used with the mgd_complete_tx() method, and then only
 *	valid for auth and (re)assoc.
 * @link_id: the link id on which the frame will be TX'ed.
 *	Only used with the mgd_prepare_tx() method.
 */
struct ieee80211_prep_tx_info {
	u16 duration;
	u16 subtype;
	u8 success:1;	/* single-bit flag */
	int link_id;
};

/**
 * struct ieee80211_ops - callbacks from mac80211 to the driver
 *
 * This structure contains various callbacks that the driver may
 * handle or, in some cases, must handle, for example to configure
 * the hardware to a new channel or to transmit a frame.
 *
 * @tx: Handler that 802.11 module calls for each transmitted frame.
 *	skb contains the buffer starting from the IEEE 802.11 header.
 *	The low-level driver should send the frame out based on
 *	configuration in the TX control data. This handler should,
 *	preferably, never fail and stop queues appropriately.
 *	Must be atomic.
 *
 * @start: Called before the first netdevice attached to the hardware
 *	is enabled. This should turn on the hardware and must turn on
 *	frame reception (for possibly enabled monitor interfaces.)
 *	Returns negative error codes, these may be seen in userspace,
 *	or zero.
* When the device is started it should not have a MAC address * to avoid acknowledging frames before a non-monitor device * is added. * Must be implemented and can sleep. * * @stop: Called after last netdevice attached to the hardware * is disabled. This should turn off the hardware (at least * it must turn off frame reception.) * May be called right after add_interface if that rejects * an interface. If you added any work onto the mac80211 workqueue * you should ensure to cancel it on this callback. * Must be implemented and can sleep. * * @suspend: Suspend the device; mac80211 itself will quiesce before and * stop transmitting and doing any other configuration, and then * ask the device to suspend. This is only invoked when WoWLAN is * configured, otherwise the device is deconfigured completely and * reconfigured at resume time. * The driver may also impose special conditions under which it * wants to use the "normal" suspend (deconfigure), say if it only * supports WoWLAN when the device is associated. In this case, it * must return 1 from this function. * * @resume: If WoWLAN was configured, this indicates that mac80211 is * now resuming its operation, after this the device must be fully * functional again. If this returns an error, the only way out is * to also unregister the device. If it returns 1, then mac80211 * will also go through the regular complete restart on resume. * * @set_wakeup: Enable or disable wakeup when WoWLAN configuration is * modified. The reason is that device_set_wakeup_enable() is * supposed to be called when the configuration changes, not only * in suspend(). * * @add_interface: Called when a netdevice attached to the hardware is * enabled. Because it is not called for monitor mode devices, @start * and @stop must be implemented. * The driver should perform any initialization it needs before * the device can be enabled. The initial configuration for the * interface is given in the conf parameter. 
* The callback may refuse to add an interface by returning a * negative error code (which will be seen in userspace.) * Must be implemented and can sleep. * * @change_interface: Called when a netdevice changes type. This callback * is optional, but only if it is supported can interface types be * switched while the interface is UP. The callback may sleep. * Note that while an interface is being switched, it will not be * found by the interface iteration callbacks. * * @remove_interface: Notifies a driver that an interface is going down. * The @stop callback is called after this if it is the last interface * and no monitor interfaces are present. * When all interfaces are removed, the MAC address in the hardware * must be cleared so the device no longer acknowledges packets, * the mac_addr member of the conf structure is, however, set to the * MAC address of the device going away. * Hence, this callback must be implemented. It can sleep. * * @config: Handler for configuration requests. IEEE 802.11 code calls this * function to change hardware configuration, e.g., channel. * This function should never fail but returns a negative error code * if it does. The callback can sleep. * * @bss_info_changed: Handler for configuration requests related to BSS * parameters that may vary during BSS's lifespan, and may affect low * level driver (e.g. assoc/disassoc status, erp parameters). * This function should not be used if no BSS has been set, unless * for association indication. The @changed parameter indicates which * of the bss parameters has changed when a call is made. The callback * can sleep. * Note: this callback is called if @vif_cfg_changed or @link_info_changed * are not implemented. * * @vif_cfg_changed: Handler for configuration requests related to interface * (MLD) parameters from &struct ieee80211_vif_cfg that vary during the * lifetime of the interface (e.g. assoc status, IP addresses, etc.) * The @changed parameter indicates which value changed. 
* The callback can sleep. * * @link_info_changed: Handler for configuration requests related to link * parameters from &struct ieee80211_bss_conf that are related to an * individual link. e.g. legacy/HT/VHT/... rate information. * The @changed parameter indicates which value changed, and the @link_id * parameter indicates the link ID. Note that the @link_id will be 0 for * non-MLO connections. * The callback can sleep. * * @prepare_multicast: Prepare for multicast filter configuration. * This callback is optional, and its return value is passed * to configure_filter(). This callback must be atomic. * * @configure_filter: Configure the device's RX filter. * See the section "Frame filtering" for more information. * This callback must be implemented and can sleep. * * @config_iface_filter: Configure the interface's RX filter. * This callback is optional and is used to configure which frames * should be passed to mac80211. The filter_flags is the combination * of FIF_* flags. The changed_flags is a bit mask that indicates * which flags are changed. * This callback can sleep. * * @set_tim: Set TIM bit. mac80211 calls this function when a TIM bit * must be set or cleared for a given STA. Must be atomic. * * @set_key: See the section "Hardware crypto acceleration" * This callback is only called between add_interface and * remove_interface calls, i.e. while the given virtual interface * is enabled. * Returns a negative error code if the key can't be added. * The callback can sleep. * * @update_tkip_key: See the section "Hardware crypto acceleration" * This callback will be called in the context of Rx. Called for drivers * which set IEEE80211_KEY_FLAG_TKIP_REQ_RX_P1_KEY. * The callback must be atomic. * * @set_rekey_data: If the device supports GTK rekeying, for example while the * host is suspended, it can assign this callback to retrieve the data * necessary to do GTK rekeying, this is the KEK, KCK and replay counter. 
* After rekeying was done it should (for example during resume) notify * userspace of the new replay counter using ieee80211_gtk_rekey_notify(). * * @set_default_unicast_key: Set the default (unicast) key index, useful for * WEP when the device sends data packets autonomously, e.g. for ARP * offloading. The index can be 0-3, or -1 for unsetting it. * * @hw_scan: Ask the hardware to service the scan request, no need to start * the scan state machine in stack. The scan must honour the channel * configuration done by the regulatory agent in the wiphy's * registered bands. The hardware (or the driver) needs to make sure * that power save is disabled. * The @req ie/ie_len members are rewritten by mac80211 to contain the * entire IEs after the SSID, so that drivers need not look at these * at all but just send them after the SSID -- mac80211 includes the * (extended) supported rates and HT information (where applicable). * When the scan finishes, ieee80211_scan_completed() must be called; * note that it also must be called when the scan cannot finish due to * any error unless this callback returned a negative error code. * This callback is also allowed to return the special return value 1, * this indicates that hardware scan isn't desirable right now and a * software scan should be done instead. A driver wishing to use this * capability must ensure its (hardware) scan capabilities aren't * advertised as more capable than mac80211's software scan is. * The callback can sleep. * * @cancel_hw_scan: Ask the low-level driver to cancel the active hw scan. * The driver should ask the hardware to cancel the scan (if possible), * but the scan will be completed only after the driver will call * ieee80211_scan_completed(). * This callback is needed for wowlan, to prevent enqueueing a new * scan_work after the low-level driver was already suspended. * The callback can sleep. * * @sched_scan_start: Ask the hardware to start scanning repeatedly at * specific intervals.
The driver must call the * ieee80211_sched_scan_results() function whenever it finds results. * This process will continue until sched_scan_stop is called. * * @sched_scan_stop: Tell the hardware to stop an ongoing scheduled scan. * In this case, ieee80211_sched_scan_stopped() must not be called. * * @sw_scan_start: Notifier function that is called just before a software scan * is started. Can be NULL, if the driver doesn't need this notification. * The mac_addr parameter allows supporting NL80211_SCAN_FLAG_RANDOM_ADDR, * the driver may set the NL80211_FEATURE_SCAN_RANDOM_MAC_ADDR flag if it * can use this parameter. The callback can sleep. * * @sw_scan_complete: Notifier function that is called just after a * software scan finished. Can be NULL, if the driver doesn't need * this notification. * The callback can sleep. * * @get_stats: Return low-level statistics. * Returns zero if statistics are available. * The callback can sleep. * * @get_key_seq: If your device implements encryption in hardware and does * IV/PN assignment then this callback should be provided to read the * IV/PN for the given key from hardware. * The callback must be atomic. * * @set_frag_threshold: Configuration of fragmentation threshold. Assign this * if the device does fragmentation by itself. Note that to prevent the * stack from doing fragmentation IEEE80211_HW_SUPPORTS_TX_FRAG * should be set as well. * The callback can sleep. * * @set_rts_threshold: Configuration of RTS threshold (if device needs it) * The callback can sleep. * * @sta_add: Notifies low level driver about addition of an associated station, * AP, IBSS/WDS/mesh peer etc. This callback can sleep. * * @sta_remove: Notifies low level driver about removal of an associated * station, AP, IBSS/WDS/mesh peer etc. Note that after the callback * returns it isn't safe to use the pointer, not even RCU protected; * no RCU grace period is guaranteed between returning here and freeing * the station. See @sta_pre_rcu_remove if needed. 
* This callback can sleep. * * @vif_add_debugfs: Drivers can use this callback to add a debugfs vif * directory with its files. This callback should be within a * CONFIG_MAC80211_DEBUGFS conditional. This callback can sleep. * * @link_add_debugfs: Drivers can use this callback to add debugfs files * when a link is added to a mac80211 vif. This callback should be within * a CONFIG_MAC80211_DEBUGFS conditional. This callback can sleep. * For non-MLO the callback will be called once for the default bss_conf * with the vif's directory rather than a separate subdirectory. * * @sta_add_debugfs: Drivers can use this callback to add debugfs files * when a station is added to mac80211's station list. This callback * should be within a CONFIG_MAC80211_DEBUGFS conditional. This * callback can sleep. * * @link_sta_add_debugfs: Drivers can use this callback to add debugfs files * when a link is added to a mac80211 station. This callback * should be within a CONFIG_MAC80211_DEBUGFS conditional. This * callback can sleep. * For non-MLO the callback will be called once for the deflink with the * station's directory rather than a separate subdirectory. * * @sta_notify: Notifies low level driver about power state transition of an * associated station, AP, IBSS/WDS/mesh peer etc. For a VIF operating * in AP mode, this callback will not be called when the flag * %IEEE80211_HW_AP_LINK_PS is set. Must be atomic. * * @sta_set_txpwr: Configure the station tx power. This callback sets the tx * power for the station. * This callback can sleep. * * @sta_state: Notifies low level driver about state transition of a * station (which can be the AP, a client, IBSS/WDS/mesh peer etc.) * This callback is mutually exclusive with @sta_add/@sta_remove. * It must not fail for down transitions but may fail for transitions * up the list of states.
Also note that after the callback returns it * isn't safe to use the pointer, not even RCU protected - no RCU grace * period is guaranteed between returning here and freeing the station. * See @sta_pre_rcu_remove if needed. * The callback can sleep. * * @sta_pre_rcu_remove: Notify driver about station removal before RCU * synchronisation. This is useful if a driver needs to have station * pointers protected using RCU, it can then use this call to clear * the pointers instead of waiting for an RCU grace period to elapse * in @sta_state. * The callback can sleep. * * @sta_rc_update: Notifies the driver of changes to the bitrates that can be * used to transmit to the station. The changes are advertised with bits * from &enum ieee80211_rate_control_changed and the values are reflected * in the station data. This callback should only be used when the driver * uses hardware rate control (%IEEE80211_HW_HAS_RATE_CONTROL) since * otherwise the rate control algorithm is notified directly. * Must be atomic. * @sta_rate_tbl_update: Notifies the driver that the rate table changed. This * is only used if the configured rate control algorithm actually uses * the new rate table API, and is therefore optional. Must be atomic. * * @sta_statistics: Get statistics for this station. For example with beacon * filtering, the statistics kept by mac80211 might not be accurate, so * let the driver pre-fill the statistics. The driver can fill most of * the values (indicating which by setting the filled bitmap), but not * all of them make sense - see the source for which ones are possible. * Statistics that the driver doesn't fill will be filled by mac80211. * The callback can sleep. * * @conf_tx: Configure TX queue parameters (EDCF (aifs, cw_min, cw_max), * bursting) for a hardware TX queue. * Returns a negative error code on failure. * The callback can sleep. * * @get_tsf: Get the current TSF timer value from firmware/hardware. 
Currently, * this is only used for IBSS mode BSSID merging and debugging. Is not a * required function. * The callback can sleep. * * @set_tsf: Set the TSF timer to the specified value in the firmware/hardware. * Currently, this is only used for IBSS mode debugging. Is not a * required function. * The callback can sleep. * * @offset_tsf: Offset the TSF timer by the specified value in the * firmware/hardware. Preferred to set_tsf as it avoids delay between * calling set_tsf() and hardware getting programmed, which will show up * as TSF delay. Is not a required function. * The callback can sleep. * * @reset_tsf: Reset the TSF timer and allow firmware/hardware to synchronize * with other STAs in the IBSS. This is only used in IBSS mode. This * function is optional if the firmware/hardware takes full care of * TSF synchronization. * The callback can sleep. * * @tx_last_beacon: Determine whether the last IBSS beacon was sent by us. * This is needed only for IBSS mode and the result of this function is * used to determine whether to reply to Probe Requests. * Returns non-zero if this device sent the last beacon. * The callback can sleep. * * @get_survey: Return per-channel survey information * * @rfkill_poll: Poll rfkill hardware state. If you need this, you also * need to set wiphy->rfkill_poll to %true before registration, * and need to call wiphy_rfkill_set_hw_state() in the callback. * The callback can sleep. * * @set_coverage_class: Set slot time for given coverage class as specified * in IEEE 802.11-2007 section 17.3.8.6 and modify ACK timeout * accordingly; coverage class equals to -1 to enable ACK timeout * estimation algorithm (dynack). To disable dynack set valid value for * coverage class. This callback is not required and may sleep. * * @testmode_cmd: Implement a cfg80211 test mode command. The passed @vif may * be %NULL. The callback can sleep. * @testmode_dump: Implement a cfg80211 test mode dump. The callback can sleep. 
* * @flush: Flush all pending frames from the hardware queue, making sure * that the hardware queues are empty. The @queues parameter is a bitmap * of queues to flush, which is useful if different virtual interfaces * use different hardware queues; it may also indicate all queues. * If the parameter @drop is set to %true, pending frames may be dropped. * Note that vif can be NULL. * The callback can sleep. * * @flush_sta: Flush or drop all pending frames from the hardware queue(s) for * the given station, as it's about to be removed. * The callback can sleep. * * @channel_switch: Drivers that need (or want) to offload the channel * switch operation for CSAs received from the AP may implement this * callback. They must then call ieee80211_chswitch_done() to indicate * completion of the channel switch. * * @set_antenna: Set antenna configuration (tx_ant, rx_ant) on the device. * Parameters are bitmaps of allowed antennas to use for TX/RX. Drivers may * reject TX/RX mask combinations they cannot support by returning -EINVAL * (also see nl80211.h @NL80211_ATTR_WIPHY_ANTENNA_TX). * * @get_antenna: Get current antenna configuration from device (tx_ant, rx_ant). * * @remain_on_channel: Starts an off-channel period on the given channel, must * call back to ieee80211_ready_on_channel() when on that channel. Note * that normal channel traffic is not stopped as this is intended for hw * offload. Frames to transmit on the off-channel channel are transmitted * normally except for the %IEEE80211_TX_CTL_TX_OFFCHAN flag. When the * duration (which will always be non-zero) expires, the driver must call * ieee80211_remain_on_channel_expired(). * Note that this callback may be called while the device is in IDLE and * must be accepted in this case. * This callback may sleep. * @cancel_remain_on_channel: Requests that an ongoing off-channel period is * aborted before it expires. This callback may sleep. * * @set_ringparam: Set tx and rx ring sizes. 
* * @get_ringparam: Get tx and rx ring current and maximum sizes. * * @tx_frames_pending: Check if there is any pending frame in the hardware * queues before entering power save. * * @set_bitrate_mask: Set a mask of rates to be used for rate control selection * when transmitting a frame. Currently only legacy rates are handled. * The callback can sleep. * @event_callback: Notify driver about any event in mac80211. See * &enum ieee80211_event_type for the different types. * The callback must be atomic. * * @release_buffered_frames: Release buffered frames according to the given * parameters. In the case where the driver buffers some frames for * sleeping stations mac80211 will use this callback to tell the driver * to release some frames, either for PS-poll or uAPSD. * Note that if the @more_data parameter is %false the driver must check * if there are more frames on the given TIDs, and if there are more than * the frames being released then it must still set the more-data bit in * the frame. If the @more_data parameter is %true, then of course the * more-data bit must always be set. * The @tids parameter tells the driver which TIDs to release frames * from, for PS-poll it will always have only a single bit set. * In the case this is used for a PS-poll initiated release, the * @num_frames parameter will always be 1 so code can be shared. In * this case the driver must also set %IEEE80211_TX_STATUS_EOSP flag * on the TX status (and must report TX status) so that the PS-poll * period is properly ended. This is used to avoid sending multiple * responses for a retried PS-poll frame. * In the case this is used for uAPSD, the @num_frames parameter may be * bigger than one, but the driver may send fewer frames (it must send * at least one, however). In this case it is also responsible for * setting the EOSP flag in the QoS header of the frames. Also, when the * service period ends, the driver must set %IEEE80211_TX_STATUS_EOSP * on the last frame in the SP. 
Alternatively, it may call the function * ieee80211_sta_eosp() to inform mac80211 of the end of the SP. * This callback must be atomic. * @allow_buffered_frames: Prepare device to allow the given number of frames * to go out to the given station. The frames will be sent by mac80211 * via the usual TX path after this call. The TX information for frames * released will also have the %IEEE80211_TX_CTL_NO_PS_BUFFER flag set * and the last one will also have %IEEE80211_TX_STATUS_EOSP set. In case * frames from multiple TIDs are released and the driver might reorder * them between the TIDs, it must set the %IEEE80211_TX_STATUS_EOSP flag * on the last frame and clear it on all others and also handle the EOSP * bit in the QoS header correctly. Alternatively, it can also call the * ieee80211_sta_eosp() function. * The @tids parameter is a bitmap and tells the driver which TIDs the * frames will be on; it will at most have two bits set. * This callback must be atomic. * * @get_et_sset_count: Ethtool API to get string-set count. * Note that the wiphy mutex is not held for this callback since it's * expected to return a static value. * * @get_et_stats: Ethtool API to get a set of u64 stats. * * @get_et_strings: Ethtool API to get a set of strings to describe stats * and perhaps other supported types of ethtool data-sets. * Note that the wiphy mutex is not held for this callback since it's * expected to return a static value. * * @mgd_prepare_tx: Prepare for transmitting a management frame for association * before associated. In multi-channel scenarios, a virtual interface is * bound to a channel before it is associated, but as it isn't associated * yet it need not necessarily be given airtime, in particular since any * transmission to a P2P GO needs to be synchronized against the GO's * powersave state. 
mac80211 will call this function before transmitting a * management frame prior to having successfully associated to allow the * driver to give it channel time for the transmission, to get a response * and to be able to synchronize with the GO. * For drivers that set %IEEE80211_HW_DEAUTH_NEED_MGD_TX_PREP, mac80211 * would also call this function before transmitting a deauthentication * frame in case that no beacon was heard from the AP/P2P GO. * The callback will be called before each transmission and upon return * mac80211 will transmit the frame right away. * Additional information is passed in the &struct ieee80211_prep_tx_info * data. If duration there is greater than zero, mac80211 hints to the * driver the duration for which the operation is requested. * The callback is optional and can (should!) sleep. * @mgd_complete_tx: Notify the driver that the response frame for a previously * transmitted frame announced with @mgd_prepare_tx was received, the data * is filled similarly to @mgd_prepare_tx though the duration is not used. * * @mgd_protect_tdls_discover: Protect a TDLS discovery session. After sending * a TDLS discovery-request, we expect a reply to arrive on the AP's * channel. We must stay on the channel (no PSM, scan, etc.), since a TDLS * setup-response is a direct packet not buffered by the AP. * mac80211 will call this function just before the transmission of a TDLS * discovery-request. The recommended period of protection is at least * 2 * (DTIM period). * The callback is optional and can sleep. * * @add_chanctx: Notifies device driver about new channel context creation. * This callback may sleep. * @remove_chanctx: Notifies device driver about channel context destruction. * This callback may sleep. * @change_chanctx: Notifies device driver about channel context changes that * may happen when combining different virtual interfaces on the same * channel context with different settings * This callback may sleep. 
* @assign_vif_chanctx: Notifies device driver about channel context being bound * to vif. Possible use is for hw queue remapping. * This callback may sleep. * @unassign_vif_chanctx: Notifies device driver about channel context being * unbound from vif. * This callback may sleep. * @switch_vif_chanctx: switch a number of vifs from one chanctx to * another, as specified in the list of * @ieee80211_vif_chanctx_switch passed to the driver, according * to the mode defined in &ieee80211_chanctx_switch_mode. * This callback may sleep. * * @start_ap: Start operation on the AP interface, this is called after all the * information in bss_conf is set and beacon can be retrieved. A channel * context is bound before this is called. Note that if the driver uses * software scan or ROC, this (and @stop_ap) isn't called when the AP is * just "paused" for scanning/ROC, which is indicated by the beacon being * disabled/enabled via @bss_info_changed. * @stop_ap: Stop operation on the AP interface. * * @reconfig_complete: Called after a call to ieee80211_restart_hw() and * during resume, when the reconfiguration has completed. * This can help the driver implement the reconfiguration step (and * indicate mac80211 is ready to receive frames). * This callback may sleep. * * @ipv6_addr_change: IPv6 address assignment on the given interface changed. * Currently, this is only called for managed or P2P client interfaces. * This callback is optional; it must not sleep. * * @channel_switch_beacon: Starts a channel switch to a new channel. * Beacons are modified to include CSA or ECSA IEs before calling this * function. The corresponding count fields in these IEs must be * decremented, and when they reach 1 the driver must call * ieee80211_csa_finish(). Drivers which use ieee80211_beacon_get() * get the csa counter decremented by mac80211, but must check if it is * 1 using ieee80211_beacon_counter_is_complete() after the beacon has been * transmitted and then call ieee80211_csa_finish(). 
* If the CSA count starts as zero or 1, this function will not be called, * since there won't be any time to beacon before the switch anyway. * @pre_channel_switch: This is an optional callback that is called * before a channel switch procedure is started (ie. when a STA * gets a CSA or a userspace initiated channel-switch), allowing * the driver to prepare for the channel switch. * @post_channel_switch: This is an optional callback that is called * after a channel switch procedure is completed, allowing the * driver to go back to a normal configuration. * @abort_channel_switch: This is an optional callback that is called * when channel switch procedure was aborted, allowing the * driver to go back to a normal configuration. * @channel_switch_rx_beacon: This is an optional callback that is called * when channel switch procedure is in progress and additional beacon with * CSA IE was received, allowing driver to track changes in count. * @join_ibss: Join an IBSS (on an IBSS interface); this is called after all * information in bss_conf is set up and the beacon can be retrieved. A * channel context is bound before this is called. * @leave_ibss: Leave the IBSS again. * * @get_expected_throughput: extract the expected throughput towards the * specified station. The returned value is expressed in Kbps. It returns 0 * if the RC algorithm does not have proper data to provide. * * @get_txpower: get current maximum tx power (in dBm) based on configuration * and hardware limits. * * @tdls_channel_switch: Start channel-switching with a TDLS peer. The driver * is responsible for continually initiating channel-switching operations * and returning to the base channel for communication with the AP. The * driver receives a channel-switch request template and the location of * the switch-timing IE within the template as part of the invocation. * The template is valid only within the call, and the driver can * optionally copy the skb for further re-use. 
* @tdls_cancel_channel_switch: Stop channel-switching with a TDLS peer. Both * peers must be on the base channel when the call completes. * @tdls_recv_channel_switch: a TDLS channel-switch related frame (request or * response) has been received from a remote peer. The driver gets * parameters parsed from the incoming frame and may use them to continue * an ongoing channel-switch operation. In addition, a channel-switch * response template is provided, together with the location of the * switch-timing IE within the template. The skb can only be used within * the function call. * * @wake_tx_queue: Called when new packets have been added to the queue. * @sync_rx_queues: Process all pending frames in RSS queues. This is a * synchronization which is needed in case driver has in its RSS queues * pending frames that were received prior to the control path action * currently taken (e.g. disassociation) but are not processed yet. * * @start_nan: join an existing NAN cluster, or create a new one. * @stop_nan: leave the NAN cluster. * @nan_change_conf: change NAN configuration. The data in cfg80211_nan_conf * contains full new configuration and changes specify which parameters * are changed with respect to the last NAN config. * The driver gets both full configuration and the changed parameters since * some devices may need the full configuration while others need only the * changed parameters. * @add_nan_func: Add a NAN function. Returns 0 on success. The data in * cfg80211_nan_func must not be referenced outside the scope of * this call. * @del_nan_func: Remove a NAN function. The driver must call * ieee80211_nan_func_terminated() with * NL80211_NAN_FUNC_TERM_REASON_USER_REQUEST reason code upon removal. * @can_aggregate_in_amsdu: Called in order to determine if HW supports * aggregating two specific frames in the same A-MSDU. The relation * between the skbs should be symmetric and transitive. Note that while * skb is always a real frame, head may or may not be an A-MSDU. 
* @get_ftm_responder_stats: Retrieve FTM responder statistics, if available. * Statistics should be cumulative; currently no way to reset is provided. * * @start_pmsr: start peer measurement (e.g. FTM) (this call can sleep) * @abort_pmsr: abort peer measurement (this call can sleep) * @set_tid_config: Apply TID specific configurations. This callback may sleep. * @reset_tid_config: Reset TID specific configuration for the peer. * This callback may sleep. * @update_vif_offload: Update virtual interface offload flags * This callback may sleep. * @sta_set_4addr: Called to notify the driver when a station starts/stops using * 4-address mode * @set_sar_specs: Update the SAR (TX power) settings. * @sta_set_decap_offload: Called to notify the driver when a station is allowed * to use rx decapsulation offload * @add_twt_setup: Update hw with TWT agreement parameters received from the peer. * This callback allows the hw to check if requested parameters * are supported and if there is enough room for a new agreement. * The hw is expected to set agreement result in the req_type field of * twt structure. * @twt_teardown_request: Update the hw with TWT teardown request received * from the peer. * @set_radar_background: Configure dedicated offchannel chain available for * radar/CAC detection on some hw. This chain can't be used to transmit * or receive frames and it is bound to a running wdev. * Background radar/CAC detection makes it possible to avoid the CAC downtime * switching to a different channel during CAC detection on the selected * radar channel. * The caller is expected to set chandef pointer to NULL in order to * disable background CAC/radar detection. * @net_fill_forward_path: Called from .ndo_fill_forward_path in order to * resolve a path for hardware flow offloading * @can_activate_links: Checks if a specific active_links bitmap is * supported by the driver.
* @change_vif_links: Change the valid links on an interface. Note that while * removing, the old link information is still valid (link_conf pointer), * but may immediately disappear after the function returns. The old or * new links bitmaps may be 0 if going from/to a non-MLO situation. * The @old array contains pointers to the old bss_conf structures * that were already removed, in case they're needed. * This callback can sleep. * @change_sta_links: Change the valid links of a station, similar to * @change_vif_links. This callback can sleep. * Note that a sta can also be inserted or removed with valid links, * i.e. passed to @sta_add/@sta_state with sta->valid_links not zero. * In fact, a station cannot change between having valid_links and * not having them. * @set_hw_timestamp: Enable/disable HW timestamping of TM/FTM frames. This is * not restored at HW reset by mac80211 so drivers need to take care of * that. * @net_setup_tc: Called from .ndo_setup_tc in order to prepare hardware * flow offloading for flows originating from the vif. * Note that the driver must not assume that the vif driver_data is valid * at this point, since the callback can be called during netdev teardown. * @can_neg_ttlm: for managed interface, requests the driver to determine * if the requested TID-To-Link mapping can be accepted or not. * If it's not accepted the driver may suggest a preferred mapping and * modify @ttlm parameter with the suggested TID-to-Link mapping.
*/ struct ieee80211_ops { void (*tx)(struct ieee80211_hw *hw, struct ieee80211_tx_control *control, struct sk_buff *skb); int (*start)(struct ieee80211_hw *hw); void (*stop)(struct ieee80211_hw *hw); #ifdef CONFIG_PM int (*suspend)(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan); int (*resume)(struct ieee80211_hw *hw); void (*set_wakeup)(struct ieee80211_hw *hw, bool enabled); #endif int (*add_interface)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); int (*change_interface)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, enum nl80211_iftype new_type, bool p2p); void (*remove_interface)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); int (*config)(struct ieee80211_hw *hw, u32 changed); void (*bss_info_changed)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *info, u64 changed); void (*vif_cfg_changed)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u64 changed); void (*link_info_changed)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *info, u64 changed); int (*start_ap)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *link_conf); void (*stop_ap)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *link_conf); u64 (*prepare_multicast)(struct ieee80211_hw *hw, struct netdev_hw_addr_list *mc_list); void (*configure_filter)(struct ieee80211_hw *hw, unsigned int changed_flags, unsigned int *total_flags, u64 multicast); void (*config_iface_filter)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, unsigned int filter_flags, unsigned int changed_flags); int (*set_tim)(struct ieee80211_hw *hw, struct ieee80211_sta *sta, bool set); int (*set_key)(struct ieee80211_hw *hw, enum set_key_cmd cmd, struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct ieee80211_key_conf *key); void (*update_tkip_key)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_key_conf *conf, struct ieee80211_sta *sta, u32 iv32, u16 
*phase1key); void (*set_rekey_data)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct cfg80211_gtk_rekey_data *data); void (*set_default_unicast_key)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, int idx); int (*hw_scan)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_scan_request *req); void (*cancel_hw_scan)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); int (*sched_scan_start)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct cfg80211_sched_scan_request *req, struct ieee80211_scan_ies *ies); int (*sched_scan_stop)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); void (*sw_scan_start)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, const u8 *mac_addr); void (*sw_scan_complete)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); int (*get_stats)(struct ieee80211_hw *hw, struct ieee80211_low_level_stats *stats); void (*get_key_seq)(struct ieee80211_hw *hw, struct ieee80211_key_conf *key, struct ieee80211_key_seq *seq); int (*set_frag_threshold)(struct ieee80211_hw *hw, u32 value); int (*set_rts_threshold)(struct ieee80211_hw *hw, u32 value); int (*sta_add)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta); int (*sta_remove)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta); #ifdef CONFIG_MAC80211_DEBUGFS void (*vif_add_debugfs)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); void (*link_add_debugfs)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *link_conf, struct dentry *dir); void (*sta_add_debugfs)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct dentry *dir); void (*link_sta_add_debugfs)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_link_sta *link_sta, struct dentry *dir); #endif void (*sta_notify)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, enum sta_notify_cmd, struct ieee80211_sta *sta); int (*sta_set_txpwr)(struct ieee80211_hw *hw, struct 
ieee80211_vif *vif, struct ieee80211_sta *sta); int (*sta_state)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, enum ieee80211_sta_state old_state, enum ieee80211_sta_state new_state); void (*sta_pre_rcu_remove)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta); void (*sta_rc_update)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, u32 changed); void (*sta_rate_tbl_update)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta); void (*sta_statistics)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct station_info *sinfo); int (*conf_tx)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, unsigned int link_id, u16 ac, const struct ieee80211_tx_queue_params *params); u64 (*get_tsf)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); void (*set_tsf)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u64 tsf); void (*offset_tsf)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, s64 offset); void (*reset_tsf)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); int (*tx_last_beacon)(struct ieee80211_hw *hw); /** * @ampdu_action: * Perform a certain A-MPDU action. * The RA/TID combination determines the destination and TID we want * the ampdu action to be performed for. The action is defined through * ieee80211_ampdu_mlme_action. * When the action is set to %IEEE80211_AMPDU_TX_OPERATIONAL the driver * may neither send aggregates containing more subframes than @buf_size * nor send aggregates in a way that lost frames would exceed the * buffer size. If just limiting the aggregate size, this would be * possible with a buf_size of 8: * * - ``TX: 1.....7`` * - ``RX: 2....7`` (lost frame #1) * - ``TX: 8..1...`` * * which is invalid since #1 was now re-transmitted well past the * buffer size of 8. 
Correct ways to retransmit #1 would be: * * - ``TX: 1 or`` * - ``TX: 18 or`` * - ``TX: 81`` * * Even ``189`` would be wrong since 1 could be lost again. * * Returns a negative error code on failure. The driver may return * %IEEE80211_AMPDU_TX_START_IMMEDIATE for %IEEE80211_AMPDU_TX_START * if the session can start immediately. * * The callback can sleep. */ int (*ampdu_action)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_ampdu_params *params); int (*get_survey)(struct ieee80211_hw *hw, int idx, struct survey_info *survey); void (*rfkill_poll)(struct ieee80211_hw *hw); void (*set_coverage_class)(struct ieee80211_hw *hw, s16 coverage_class); #ifdef CONFIG_NL80211_TESTMODE int (*testmode_cmd)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, void *data, int len); int (*testmode_dump)(struct ieee80211_hw *hw, struct sk_buff *skb, struct netlink_callback *cb, void *data, int len); #endif void (*flush)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u32 queues, bool drop); void (*flush_sta)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta); void (*channel_switch)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_channel_switch *ch_switch); int (*set_antenna)(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant); int (*get_antenna)(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant); int (*remain_on_channel)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_channel *chan, int duration, enum ieee80211_roc_type type); int (*cancel_remain_on_channel)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); int (*set_ringparam)(struct ieee80211_hw *hw, u32 tx, u32 rx); void (*get_ringparam)(struct ieee80211_hw *hw, u32 *tx, u32 *tx_max, u32 *rx, u32 *rx_max); bool (*tx_frames_pending)(struct ieee80211_hw *hw); int (*set_bitrate_mask)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, const struct cfg80211_bitrate_mask *mask); void (*event_callback)(struct ieee80211_hw *hw, struct 
ieee80211_vif *vif, const struct ieee80211_event *event); void (*allow_buffered_frames)(struct ieee80211_hw *hw, struct ieee80211_sta *sta, u16 tids, int num_frames, enum ieee80211_frame_release_type reason, bool more_data); void (*release_buffered_frames)(struct ieee80211_hw *hw, struct ieee80211_sta *sta, u16 tids, int num_frames, enum ieee80211_frame_release_type reason, bool more_data); int (*get_et_sset_count)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, int sset); void (*get_et_stats)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ethtool_stats *stats, u64 *data); void (*get_et_strings)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u32 sset, u8 *data); void (*mgd_prepare_tx)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_prep_tx_info *info); void (*mgd_complete_tx)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_prep_tx_info *info); void (*mgd_protect_tdls_discover)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, unsigned int link_id); int (*add_chanctx)(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *ctx); void (*remove_chanctx)(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *ctx); void (*change_chanctx)(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *ctx, u32 changed); int (*assign_vif_chanctx)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *link_conf, struct ieee80211_chanctx_conf *ctx); void (*unassign_vif_chanctx)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *link_conf, struct ieee80211_chanctx_conf *ctx); int (*switch_vif_chanctx)(struct ieee80211_hw *hw, struct ieee80211_vif_chanctx_switch *vifs, int n_vifs, enum ieee80211_chanctx_switch_mode mode); void (*reconfig_complete)(struct ieee80211_hw *hw, enum ieee80211_reconfig_type reconfig_type); #if IS_ENABLED(CONFIG_IPV6) void (*ipv6_addr_change)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct inet6_dev *idev); #endif void 
(*channel_switch_beacon)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct cfg80211_chan_def *chandef); int (*pre_channel_switch)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_channel_switch *ch_switch); int (*post_channel_switch)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *link_conf); void (*abort_channel_switch)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *link_conf); void (*channel_switch_rx_beacon)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_channel_switch *ch_switch); int (*join_ibss)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); void (*leave_ibss)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); u32 (*get_expected_throughput)(struct ieee80211_hw *hw, struct ieee80211_sta *sta); int (*get_txpower)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, int *dbm); int (*tdls_channel_switch)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, u8 oper_class, struct cfg80211_chan_def *chandef, struct sk_buff *tmpl_skb, u32 ch_sw_tm_ie); void (*tdls_cancel_channel_switch)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta); void (*tdls_recv_channel_switch)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_tdls_ch_sw_params *params); void (*wake_tx_queue)(struct ieee80211_hw *hw, struct ieee80211_txq *txq); void (*sync_rx_queues)(struct ieee80211_hw *hw); int (*start_nan)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct cfg80211_nan_conf *conf); int (*stop_nan)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); int (*nan_change_conf)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct cfg80211_nan_conf *conf, u32 changes); int (*add_nan_func)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, const struct cfg80211_nan_func *nan_func); void (*del_nan_func)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u8 instance_id); bool 
(*can_aggregate_in_amsdu)(struct ieee80211_hw *hw, struct sk_buff *head, struct sk_buff *skb); int (*get_ftm_responder_stats)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct cfg80211_ftm_responder_stats *ftm_stats); int (*start_pmsr)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct cfg80211_pmsr_request *request); void (*abort_pmsr)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct cfg80211_pmsr_request *request); int (*set_tid_config)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct cfg80211_tid_config *tid_conf); int (*reset_tid_config)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, u8 tids); void (*update_vif_offload)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); void (*sta_set_4addr)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, bool enabled); int (*set_sar_specs)(struct ieee80211_hw *hw, const struct cfg80211_sar_specs *sar); void (*sta_set_decap_offload)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, bool enabled); void (*add_twt_setup)(struct ieee80211_hw *hw, struct ieee80211_sta *sta, struct ieee80211_twt_setup *twt); void (*twt_teardown_request)(struct ieee80211_hw *hw, struct ieee80211_sta *sta, u8 flowid); int (*set_radar_background)(struct ieee80211_hw *hw, struct cfg80211_chan_def *chandef); int (*net_fill_forward_path)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct net_device_path_ctx *ctx, struct net_device_path *path); bool (*can_activate_links)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u16 active_links); int (*change_vif_links)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u16 old_links, u16 new_links, struct ieee80211_bss_conf *old[IEEE80211_MLD_MAX_NUM_LINKS]); int (*change_sta_links)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, u16 old_links, u16 new_links); int (*set_hw_timestamp)(struct 
ieee80211_hw *hw, struct ieee80211_vif *vif, struct cfg80211_set_hw_timestamp *hwts); int (*net_setup_tc)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct net_device *dev, enum tc_setup_type type, void *type_data); enum ieee80211_neg_ttlm_res (*can_neg_ttlm)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_neg_ttlm *ttlm); }; /** * ieee80211_alloc_hw_nm - Allocate a new hardware device * * This must be called once for each hardware device. The returned pointer * must be used to refer to this device when calling other functions. * mac80211 allocates a private data area for the driver pointed to by * @priv in &struct ieee80211_hw, the size of this area is given as * @priv_data_len. * * @priv_data_len: length of private data * @ops: callbacks for this device * @requested_name: Requested name for this device. * NULL is valid value, and means use the default naming (phy%d) * * Return: A pointer to the new hardware device, or %NULL on error. */ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, const struct ieee80211_ops *ops, const char *requested_name); /** * ieee80211_alloc_hw - Allocate a new hardware device * * This must be called once for each hardware device. The returned pointer * must be used to refer to this device when calling other functions. * mac80211 allocates a private data area for the driver pointed to by * @priv in &struct ieee80211_hw, the size of this area is given as * @priv_data_len. * * @priv_data_len: length of private data * @ops: callbacks for this device * * Return: A pointer to the new hardware device, or %NULL on error. */ static inline struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, const struct ieee80211_ops *ops) { return ieee80211_alloc_hw_nm(priv_data_len, ops, NULL); } /** * ieee80211_register_hw - Register hardware device * * You must call this function before any other functions in * mac80211. 
Note that before a hardware can be registered, you * need to fill the contained wiphy's information. * * @hw: the device to register as returned by ieee80211_alloc_hw() * * Return: 0 on success. An error code otherwise. */ int ieee80211_register_hw(struct ieee80211_hw *hw); /** * struct ieee80211_tpt_blink - throughput blink description * @throughput: throughput in Kbit/sec * @blink_time: blink time in milliseconds * (full cycle, i.e. one off + one on period) */ struct ieee80211_tpt_blink { int throughput; int blink_time; }; /** * enum ieee80211_tpt_led_trigger_flags - throughput trigger flags * @IEEE80211_TPT_LEDTRIG_FL_RADIO: enable blinking with radio * @IEEE80211_TPT_LEDTRIG_FL_WORK: enable blinking when working * @IEEE80211_TPT_LEDTRIG_FL_CONNECTED: enable blinking when at least one * interface is connected in some way, including being an AP * * Each flag occupies its own bit, so the values can be OR-ed together. */ enum ieee80211_tpt_led_trigger_flags { IEEE80211_TPT_LEDTRIG_FL_RADIO = BIT(0), IEEE80211_TPT_LEDTRIG_FL_WORK = BIT(1), IEEE80211_TPT_LEDTRIG_FL_CONNECTED = BIT(2), }; #ifdef CONFIG_MAC80211_LEDS const char *__ieee80211_get_tx_led_name(struct ieee80211_hw *hw); const char *__ieee80211_get_rx_led_name(struct ieee80211_hw *hw); const char *__ieee80211_get_assoc_led_name(struct ieee80211_hw *hw); const char *__ieee80211_get_radio_led_name(struct ieee80211_hw *hw); const char * __ieee80211_create_tpt_led_trigger(struct ieee80211_hw *hw, unsigned int flags, const struct ieee80211_tpt_blink *blink_table, unsigned int blink_table_len); #endif /** * ieee80211_get_tx_led_name - get name of TX LED * * mac80211 creates a transmit LED trigger for each wireless hardware * that can be used to drive LEDs if your driver registers a LED device. * This function returns the name (or %NULL if not configured for LEDs) * of the trigger so you can automatically link the LED device. * * @hw: the hardware to get the LED trigger name for * * Return: The name of the LED trigger. %NULL if not configured for LEDs. * Always %NULL when CONFIG_MAC80211_LEDS is not set. 
*/ static inline const char *ieee80211_get_tx_led_name(struct ieee80211_hw *hw) { #ifdef CONFIG_MAC80211_LEDS return __ieee80211_get_tx_led_name(hw); #else return NULL; #endif } /** * ieee80211_get_rx_led_name - get name of RX LED * * mac80211 creates a receive LED trigger for each wireless hardware * that can be used to drive LEDs if your driver registers a LED device. * This function returns the name (or %NULL if not configured for LEDs) * of the trigger so you can automatically link the LED device. * * @hw: the hardware to get the LED trigger name for * * Return: The name of the LED trigger. %NULL if not configured for LEDs. * Always %NULL when CONFIG_MAC80211_LEDS is not set. */ static inline const char *ieee80211_get_rx_led_name(struct ieee80211_hw *hw) { #ifdef CONFIG_MAC80211_LEDS return __ieee80211_get_rx_led_name(hw); #else return NULL; #endif } /** * ieee80211_get_assoc_led_name - get name of association LED * * mac80211 creates an association LED trigger for each wireless hardware * that can be used to drive LEDs if your driver registers a LED device. * This function returns the name (or %NULL if not configured for LEDs) * of the trigger so you can automatically link the LED device. * * @hw: the hardware to get the LED trigger name for * * Return: The name of the LED trigger. %NULL if not configured for LEDs. * Always %NULL when CONFIG_MAC80211_LEDS is not set. */ static inline const char *ieee80211_get_assoc_led_name(struct ieee80211_hw *hw) { #ifdef CONFIG_MAC80211_LEDS return __ieee80211_get_assoc_led_name(hw); #else return NULL; #endif } /** * ieee80211_get_radio_led_name - get name of radio LED * * mac80211 creates a radio change LED trigger for each wireless hardware * that can be used to drive LEDs if your driver registers a LED device. * This function returns the name (or %NULL if not configured for LEDs) * of the trigger so you can automatically link the LED device. * * @hw: the hardware to get the LED trigger name for * * Return: The name of the LED trigger. %NULL if not configured for LEDs. * Always %NULL when CONFIG_MAC80211_LEDS is not set. 
*/ static inline const char *ieee80211_get_radio_led_name(struct ieee80211_hw *hw) { #ifdef CONFIG_MAC80211_LEDS return __ieee80211_get_radio_led_name(hw); #else return NULL; #endif } /** * ieee80211_create_tpt_led_trigger - create throughput LED trigger * @hw: the hardware to create the trigger for * @flags: trigger flags, see &enum ieee80211_tpt_led_trigger_flags * @blink_table: the blink table (&struct ieee80211_tpt_blink entries) -- * needs to be ordered by throughput * @blink_table_len: size of the blink table * * Return: %NULL (in case of error, or if no LED triggers are * configured) or the name of the new trigger. Always %NULL when * CONFIG_MAC80211_LEDS is not set. * * Note: This function must be called before ieee80211_register_hw(). */ static inline const char * ieee80211_create_tpt_led_trigger(struct ieee80211_hw *hw, unsigned int flags, const struct ieee80211_tpt_blink *blink_table, unsigned int blink_table_len) { #ifdef CONFIG_MAC80211_LEDS return __ieee80211_create_tpt_led_trigger(hw, flags, blink_table, blink_table_len); #else return NULL; #endif } /** * ieee80211_unregister_hw - Unregister a hardware device * * This function instructs mac80211 to free allocated resources * and unregister netdevices from the networking subsystem. * * @hw: the hardware to unregister */ void ieee80211_unregister_hw(struct ieee80211_hw *hw); /** * ieee80211_free_hw - free hardware descriptor * * This function frees everything that was allocated, including the * private data for the driver. You must call ieee80211_unregister_hw() * before calling this function. * * @hw: the hardware to free */ void ieee80211_free_hw(struct ieee80211_hw *hw); /** * ieee80211_restart_hw - restart hardware completely * * Call this function when the hardware was restarted for some reason * (hardware error, ...) and the driver is unable to restore its state * by itself. mac80211 assumes that at this point the driver/hardware * is completely uninitialised and stopped; it starts the process by * calling the ->start() operation. 
The driver will need to reset all * internal state that it has prior to calling this function. * * @hw: the hardware to restart */ void ieee80211_restart_hw(struct ieee80211_hw *hw); /** * ieee80211_rx_list - receive frame and store processed skbs in a list * * Use this function to hand received frames to mac80211. The receive * buffer in @skb must start with an IEEE 802.11 header. In case a * paged @skb is used, the driver is recommended to put the ieee80211 * header of the frame on the linear part of the @skb to avoid memory * allocation and/or memcpy by the stack. * * This function may not be called in IRQ context. Calls to this function * for a single hardware must be synchronized against each other. Calls to * this function, ieee80211_rx_ni() and ieee80211_rx_irqsafe() may not be * mixed for a single hardware. Must not run concurrently with * ieee80211_tx_status_skb() or ieee80211_tx_status_ni(). * * This function must be called with BHs disabled and the RCU read lock * held. * * @hw: the hardware this frame came in on * @sta: the station the frame was received from, or %NULL * @skb: the buffer to receive, owned by mac80211 after this call * @list: the destination list */ void ieee80211_rx_list(struct ieee80211_hw *hw, struct ieee80211_sta *sta, struct sk_buff *skb, struct list_head *list); /** * ieee80211_rx_napi - receive frame from NAPI context * * Use this function to hand received frames to mac80211. The receive * buffer in @skb must start with an IEEE 802.11 header. In case a * paged @skb is used, the driver is recommended to put the ieee80211 * header of the frame on the linear part of the @skb to avoid memory * allocation and/or memcpy by the stack. * * This function may not be called in IRQ context. Calls to this function * for a single hardware must be synchronized against each other. Calls to * this function, ieee80211_rx_ni() and ieee80211_rx_irqsafe() may not be * mixed for a single hardware. 
Must not run concurrently with * ieee80211_tx_status_skb() or ieee80211_tx_status_ni(). * * This function must be called with BHs disabled. * * @hw: the hardware this frame came in on * @sta: the station the frame was received from, or %NULL * @skb: the buffer to receive, owned by mac80211 after this call * @napi: the NAPI context */ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *sta, struct sk_buff *skb, struct napi_struct *napi); /** * ieee80211_rx - receive frame * * Use this function to hand received frames to mac80211. The receive * buffer in @skb must start with an IEEE 802.11 header. In case a * paged @skb is used, the driver is recommended to put the ieee80211 * header of the frame on the linear part of the @skb to avoid memory * allocation and/or memcpy by the stack. * * This function may not be called in IRQ context. Calls to this function * for a single hardware must be synchronized against each other. Calls to * this function, ieee80211_rx_ni() and ieee80211_rx_irqsafe() may not be * mixed for a single hardware. Must not run concurrently with * ieee80211_tx_status_skb() or ieee80211_tx_status_ni(). * * In process context, use ieee80211_rx_ni() instead. * * This is ieee80211_rx_napi() called with neither a station nor a NAPI * context (both are passed as %NULL). * * @hw: the hardware this frame came in on * @skb: the buffer to receive, owned by mac80211 after this call */ static inline void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb) { ieee80211_rx_napi(hw, NULL, skb, NULL); } /** * ieee80211_rx_irqsafe - receive frame * * Like ieee80211_rx() but can be called in IRQ context * (internally defers to a tasklet.) * * Calls to this function, ieee80211_rx() or ieee80211_rx_ni() may not * be mixed for a single hardware. Must not run concurrently with * ieee80211_tx_status_skb() or ieee80211_tx_status_ni(). 
* * @hw: the hardware this frame came in on * @skb: the buffer to receive, owned by mac80211 after this call */ void ieee80211_rx_irqsafe(struct ieee80211_hw *hw, struct sk_buff *skb); /** * ieee80211_rx_ni - receive frame (in process context) * * Like ieee80211_rx() but can be called in process context * (internally disables bottom halves around the call to ieee80211_rx()). * * Calls to this function, ieee80211_rx() and ieee80211_rx_irqsafe() may * not be mixed for a single hardware. Must not run concurrently with * ieee80211_tx_status_skb() or ieee80211_tx_status_ni(). * * @hw: the hardware this frame came in on * @skb: the buffer to receive, owned by mac80211 after this call */ static inline void ieee80211_rx_ni(struct ieee80211_hw *hw, struct sk_buff *skb) { local_bh_disable(); ieee80211_rx(hw, skb); local_bh_enable(); } /** * ieee80211_sta_ps_transition - PS transition for connected sta * * When operating in AP mode with the %IEEE80211_HW_AP_LINK_PS * flag set, use this function to inform mac80211 about a connected station * entering/leaving PS mode. * * This function may not be called in IRQ context or with softirqs enabled. * * Calls to this function for a single hardware must be synchronized against * each other. * * @sta: currently connected sta * @start: %true to start PS, %false to stop it * * Return: 0 on success. -EINVAL when the requested PS mode is already set. */ int ieee80211_sta_ps_transition(struct ieee80211_sta *sta, bool start); /** * ieee80211_sta_ps_transition_ni - PS transition for connected sta * (in process context) * * Like ieee80211_sta_ps_transition() but can be called in process context * (internally disables bottom halves). The concurrent call restriction * still applies. * * @sta: currently connected sta * @start: %true to start PS, %false to stop it * * Return: Like ieee80211_sta_ps_transition(). 
*/ static inline int ieee80211_sta_ps_transition_ni(struct ieee80211_sta *sta, bool start) { int ret; /* ieee80211_sta_ps_transition() may not run with softirqs enabled */ local_bh_disable(); ret = ieee80211_sta_ps_transition(sta, start); local_bh_enable(); return ret; } /** * ieee80211_sta_pspoll - PS-Poll frame received * @sta: currently connected station * * When operating in AP mode with the %IEEE80211_HW_AP_LINK_PS flag set, * use this function to inform mac80211 that a PS-Poll frame from a * connected station was received. * This must be used in conjunction with ieee80211_sta_ps_transition() * and possibly ieee80211_sta_uapsd_trigger(); calls to all three must * be serialized. */ void ieee80211_sta_pspoll(struct ieee80211_sta *sta); /** * ieee80211_sta_uapsd_trigger - (potential) U-APSD trigger frame received * @sta: currently connected station * @tid: TID of the received (potential) trigger frame * * When operating in AP mode with the %IEEE80211_HW_AP_LINK_PS flag set, * use this function to inform mac80211 that a (potential) trigger frame * from a connected station was received. * This must be used in conjunction with ieee80211_sta_ps_transition() * and possibly ieee80211_sta_pspoll(); calls to all three must be * serialized. * %IEEE80211_NUM_TIDS can be passed as @tid if the TID is unknown. * In this case, mac80211 will not check that this TID maps to an AC * that is trigger enabled and will assume that the caller did the proper * checks. */ void ieee80211_sta_uapsd_trigger(struct ieee80211_sta *sta, u8 tid); /* * The TX headroom reserved by mac80211 for its own tx_status functions. * This is enough for the radiotap header. 
*/ #define IEEE80211_TX_STATUS_HEADROOM ALIGN(14, 4) /** * ieee80211_sta_set_buffered - inform mac80211 about driver-buffered frames * @sta: &struct ieee80211_sta pointer for the sleeping station * @tid: the TID that has buffered frames * @buffered: indicates whether or not frames are buffered for this TID * * If a driver buffers frames for a powersave station instead of passing * them back to mac80211 for retransmission, the station may still need * to be told that there are buffered frames via the TIM bit. * * This function informs mac80211 whether or not there are frames that are * buffered in the driver for a given TID; mac80211 can then use this data * to set the TIM bit (NOTE: This may call back into the driver's set_tim * call! Beware of the locking!) * * If all frames are released to the station (due to PS-Poll or U-APSD) * then the driver needs to inform mac80211 that there no longer are * frames buffered. However, when the station wakes up mac80211 assumes * that all buffered frames will be transmitted and clears this data; * drivers therefore need to make sure they inform mac80211 about all * buffered frames on the sleep transition (sta_notify() with * %STA_NOTIFY_SLEEP). * * Note that technically mac80211 only needs to know this per AC, not per * TID, but since driver buffering will inevitably happen per TID (since * it is related to aggregation) it is easier to make mac80211 map the * TID to the AC as required instead of keeping track in all drivers that * use this API. */ void ieee80211_sta_set_buffered(struct ieee80211_sta *sta, u8 tid, bool buffered); /** * ieee80211_get_tx_rates - get the selected transmit rates for a packet * * Call this function in a driver with per-packet rate selection support * to combine the rate info in the packet tx info with the most recent * rate selection table for the station entry. * * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @sta: the receiver station to which this packet is sent. 
* @skb: the frame to be transmitted. * @dest: buffer for extracted rate/retry information * @max_rates: maximum number of rates to fetch */ void ieee80211_get_tx_rates(struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct sk_buff *skb, struct ieee80211_tx_rate *dest, int max_rates); /** * ieee80211_sta_set_expected_throughput - set the expected tpt for a station * * Call this function to notify mac80211 about a change in expected throughput * to a station. A driver for a device that does rate control in firmware can * call this function when the expected throughput estimate towards a station * changes. The information is used to tune the CoDel AQM applied to traffic * going towards that station (which can otherwise be too aggressive and cause * slow stations to starve). * * @pubsta: the station to set throughput for. * @thr: the current expected throughput in kbps. */ void ieee80211_sta_set_expected_throughput(struct ieee80211_sta *pubsta, u32 thr); /** * ieee80211_tx_rate_update - transmit rate update callback * * Drivers should call this function with a non-NULL @pubsta. * This function can be used in drivers that do not have a provision * for updating the tx rate in the data path. * * @hw: the hardware the frame was transmitted by * @pubsta: the station to update the tx rate for. * @info: tx status information */ void ieee80211_tx_rate_update(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, struct ieee80211_tx_info *info); /** * ieee80211_tx_status_skb - transmit status callback * * Call this function for all transmitted frames after they have been * transmitted. It is permissible to not call this function for * multicast frames but this can affect statistics. * * This function may not be called in IRQ context. Calls to this function * for a single hardware must be synchronized against each other. Calls * to this function, ieee80211_tx_status_ni() and ieee80211_tx_status_irqsafe() * may not be mixed for a single hardware. 
Must not run concurrently with * ieee80211_rx() or ieee80211_rx_ni(). * * @hw: the hardware the frame was transmitted by * @skb: the frame that was transmitted, owned by mac80211 after this call */ void ieee80211_tx_status_skb(struct ieee80211_hw *hw, struct sk_buff *skb); /** * ieee80211_tx_status_ext - extended transmit status callback * * This function can be used as a replacement for ieee80211_tx_status_skb() * in drivers that may want to provide extra information that does not * fit into &struct ieee80211_tx_info. * * Calls to this function for a single hardware must be synchronized * against each other. Calls to this function, ieee80211_tx_status_ni() * and ieee80211_tx_status_irqsafe() may not be mixed for a single hardware. * * @hw: the hardware the frame was transmitted by * @status: tx status information */ void ieee80211_tx_status_ext(struct ieee80211_hw *hw, struct ieee80211_tx_status *status); /** * ieee80211_tx_status_noskb - transmit status callback without skb * * This function can be used as a replacement for ieee80211_tx_status_skb() * in drivers that cannot reliably map tx status information back to * specific skbs. * * It builds a &struct ieee80211_tx_status carrying only @sta and @info * and hands it to ieee80211_tx_status_ext(). * * Calls to this function for a single hardware must be synchronized * against each other. Calls to this function, ieee80211_tx_status_ni() * and ieee80211_tx_status_irqsafe() may not be mixed for a single hardware. * * @hw: the hardware the frame was transmitted by * @sta: the receiver station to which this packet is sent * (%NULL for multicast packets) * @info: tx status information */ static inline void ieee80211_tx_status_noskb(struct ieee80211_hw *hw, struct ieee80211_sta *sta, struct ieee80211_tx_info *info) { /* members not named in the initializer are zero-initialized */ struct ieee80211_tx_status status = { .sta = sta, .info = info, }; ieee80211_tx_status_ext(hw, &status); } /** * ieee80211_tx_status_ni - transmit status callback (in process context) * * Like ieee80211_tx_status_skb() but can be called in process context. 
* * Calls to this function, ieee80211_tx_status_skb() and * ieee80211_tx_status_irqsafe() may not be mixed * for a single hardware. * * @hw: the hardware the frame was transmitted by * @skb: the frame that was transmitted, owned by mac80211 after this call */ static inline void ieee80211_tx_status_ni(struct ieee80211_hw *hw, struct sk_buff *skb) { local_bh_disable(); ieee80211_tx_status_skb(hw, skb); local_bh_enable(); } /** * ieee80211_tx_status_irqsafe - IRQ-safe transmit status callback * * Like ieee80211_tx_status_skb() but can be called in IRQ context * (internally defers to a tasklet.) * * Calls to this function, ieee80211_tx_status_skb() and * ieee80211_tx_status_ni() may not be mixed for a single hardware. * * @hw: the hardware the frame was transmitted by * @skb: the frame that was transmitted, owned by mac80211 after this call */ void ieee80211_tx_status_irqsafe(struct ieee80211_hw *hw, struct sk_buff *skb); /** * ieee80211_report_low_ack - report non-responding station * * When operating in AP-mode, call this function to report a non-responding * connected STA. * * @sta: the non-responding connected sta * @num_packets: number of packets sent to @sta without a response */ void ieee80211_report_low_ack(struct ieee80211_sta *sta, u32 num_packets); #define IEEE80211_MAX_CNTDWN_COUNTERS_NUM 2 /** * struct ieee80211_mutable_offsets - mutable beacon offsets * @tim_offset: position of TIM element * @tim_length: size of TIM element * @cntdwn_counter_offs: array of IEEE80211_MAX_CNTDWN_COUNTERS_NUM offsets * to countdown counters. This array can contain zero values which * should be ignored. * @mbssid_off: position of the multiple BSSID element */ struct ieee80211_mutable_offsets { u16 tim_offset; u16 tim_length; u16 cntdwn_counter_offs[IEEE80211_MAX_CNTDWN_COUNTERS_NUM]; u16 mbssid_off; }; /** * ieee80211_beacon_get_template - beacon template generation function * @hw: pointer obtained from ieee80211_alloc_hw(). 
* @vif: &struct ieee80211_vif pointer from the add_interface callback. * @offs: &struct ieee80211_mutable_offsets pointer to struct that will * receive the offsets that may be updated by the driver. * @link_id: the link id to which the beacon belongs (or 0 for an AP STA * that is not associated with AP MLD). * * If the driver implements beaconing modes, it must use this function to * obtain the beacon template. * * This function should be used if the beacon frames are generated by the * device, and then the driver must use the returned beacon as the template. * The driver or the device is responsible for updating the DTIM and, when * applicable, the CSA count. * * The driver is responsible for freeing the returned skb. * * Return: The beacon template. %NULL on error. */ struct sk_buff * ieee80211_beacon_get_template(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_mutable_offsets *offs, unsigned int link_id); /** * ieee80211_beacon_get_template_ema_index - EMA beacon template generation * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @offs: &struct ieee80211_mutable_offsets pointer to struct that will * receive the offsets that may be updated by the driver. * @link_id: the link id to which the beacon belongs (or 0 for a non-MLD AP). * @ema_index: index of the beacon in the EMA set. * * This function follows the same rules as ieee80211_beacon_get_template() * but returns a beacon template which includes the multiple BSSID element * at the requested index. * * Return: The beacon template. %NULL indicates the end of EMA templates. */ struct sk_buff * ieee80211_beacon_get_template_ema_index(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_mutable_offsets *offs, unsigned int link_id, u8 ema_index); /** * struct ieee80211_ema_beacons - List of EMA beacons * @cnt: count of EMA beacons. * * @bcn: array of EMA beacons. 
* @bcn.skb: the skb containing this specific beacon * @bcn.offs: &struct ieee80211_mutable_offsets (embedded by value) holding * the offsets of this beacon that may be updated by the driver. */ struct ieee80211_ema_beacons { u8 cnt; struct { struct sk_buff *skb; struct ieee80211_mutable_offsets offs; } bcn[]; }; /** * ieee80211_beacon_get_template_ema_list - EMA beacon template generation * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @link_id: the link id to which the beacon belongs (or 0 for a non-MLD AP) * * This function follows the same rules as ieee80211_beacon_get_template() * but allocates and returns a pointer to a list of all beacon templates * required to cover all profiles in the multiple BSSID set. Each template * includes only one multiple BSSID element. * * The driver must call ieee80211_beacon_free_ema_list() to free the memory. * * Return: EMA beacon templates of type struct ieee80211_ema_beacons *. * %NULL on error. */ struct ieee80211_ema_beacons * ieee80211_beacon_get_template_ema_list(struct ieee80211_hw *hw, struct ieee80211_vif *vif, unsigned int link_id); /** * ieee80211_beacon_free_ema_list - free an EMA beacon template list * @ema_beacons: pointer to the &struct ieee80211_ema_beacons list to free. * * This function will free a list previously acquired by calling * ieee80211_beacon_get_template_ema_list(). */ void ieee80211_beacon_free_ema_list(struct ieee80211_ema_beacons *ema_beacons); /** * ieee80211_beacon_get_tim - beacon generation function * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @tim_offset: pointer to variable that will receive the TIM IE offset. * Set to 0 if invalid (in non-AP modes). * @tim_length: pointer to variable that will receive the TIM IE length, * (including the ID and length bytes!). * Set to 0 if invalid (in non-AP modes). 
* @link_id: the link id to which the beacon belongs (or 0 for an AP STA * that is not associated with AP MLD). * * If the driver implements beaconing modes, it must use this function to * obtain the beacon frame. * * If the beacon frames are generated by the host system (i.e., not in * hardware/firmware), the driver uses this function to get each beacon * frame from mac80211 -- it is responsible for calling this function exactly * once before the beacon is needed (e.g. based on hardware interrupt). * * The driver is responsible for freeing the returned skb. * * Return: The beacon template. %NULL on error. */ struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u16 *tim_offset, u16 *tim_length, unsigned int link_id); /** * ieee80211_beacon_get - beacon generation function * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @link_id: the link id to which the beacon belongs (or 0 for an AP STA * that is not associated with AP MLD). * * Calls ieee80211_beacon_get_tim() without asking for the TIM offset and * length (both pointers are passed as %NULL). See ieee80211_beacon_get_tim() * for the rules that apply. * * Return: See ieee80211_beacon_get_tim(). */ static inline struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif, unsigned int link_id) { return ieee80211_beacon_get_tim(hw, vif, NULL, NULL, link_id); } /** * ieee80211_beacon_update_cntdwn - request mac80211 to decrement the beacon countdown * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @link_id: valid link_id during MLO or 0 for non-MLO * * The beacon counter should be updated after each beacon transmission. * This function is called implicitly when ieee80211_beacon_get() or * ieee80211_beacon_get_tim() is called; however, if the * beacon frames are generated by the device, the driver should call this * function after each beacon transmission to sync mac80211's beacon countdown. 
* * Return: new countdown value */ u8 ieee80211_beacon_update_cntdwn(struct ieee80211_vif *vif, unsigned int link_id); /** * ieee80211_beacon_set_cntdwn - request mac80211 to set beacon countdown * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @counter: the new value for the counter * * The beacon countdown can be changed by the device; this API should be * used by the device driver to update the CSA counter in mac80211. * * It should never be used together with ieee80211_beacon_update_cntdwn(), * as it will cause a race condition around the counter value. */ void ieee80211_beacon_set_cntdwn(struct ieee80211_vif *vif, u8 counter); /** * ieee80211_csa_finish - notify mac80211 about channel switch * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @link_id: valid link_id during MLO or 0 for non-MLO * * After a channel switch announcement was scheduled and the counter in this * announcement hits 1, this function must be called by the driver to * notify mac80211 that the channel can be changed. */ void ieee80211_csa_finish(struct ieee80211_vif *vif, unsigned int link_id); /** * ieee80211_beacon_cntdwn_is_complete - find out if countdown reached 1 * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @link_id: valid link_id during MLO or 0 for non-MLO * * Return: %true if the countdown is complete. The summary line says this * means the counter reached 1, while this description previously said * zero -- NOTE(review): confirm the exact value against the implementation. */ bool ieee80211_beacon_cntdwn_is_complete(struct ieee80211_vif *vif, unsigned int link_id); /** * ieee80211_color_change_finish - notify mac80211 about color change * @vif: &struct ieee80211_vif pointer from the add_interface callback. 
* * After a color change announcement was scheduled and the counter in this * announcement hits 1, this function must be called by the driver to * notify mac80211 that the color can be changed */ void ieee80211_color_change_finish(struct ieee80211_vif *vif); /** * ieee80211_proberesp_get - retrieve a Probe Response template * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. * * Creates a Probe Response template which can, for example, be uploaded to * hardware. The destination address should be set by the caller. * * Can only be called in AP mode. * * Return: The Probe Response template. %NULL on error. */ struct sk_buff *ieee80211_proberesp_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif); /** * ieee80211_pspoll_get - retrieve a PS Poll template * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. * * Creates a PS Poll template which can, for example, be uploaded to * hardware. The template must be updated after association so that the * correct AID, BSSID and MAC address are used. * * Note: Caller (or hardware) is responsible for setting the * &IEEE80211_FCTL_PM bit. * * Return: The PS Poll template. %NULL on error. */ struct sk_buff *ieee80211_pspoll_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif); /** * ieee80211_nullfunc_get - retrieve a nullfunc template * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @link_id: If the vif is an MLD, get a frame with the link addresses * for the given link ID. For a link_id < 0 you get a frame with * MLD addresses, however useful that might be. * @qos_ok: QoS NDP is acceptable to the caller, this should be set * if at all possible * * Creates a Nullfunc template which can, for example, be uploaded to * hardware. The template must be updated after association so that the * correct BSSID and address are used. 
* * If @qos_ok is set and the association is to an AP with QoS/WMM, the * returned packet will be a QoS NDP. * * Note: Caller (or hardware) is responsible for setting the * &IEEE80211_FCTL_PM bit as well as Duration and Sequence Control fields. * * Return: The nullfunc template. %NULL on error. */ struct sk_buff *ieee80211_nullfunc_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif, int link_id, bool qos_ok); /** * ieee80211_probereq_get - retrieve a Probe Request template * @hw: pointer obtained from ieee80211_alloc_hw(). * @src_addr: source MAC address * @ssid: SSID buffer * @ssid_len: length of SSID * @tailroom: tailroom to reserve at end of SKB for IEs * * Creates a Probe Request template which can, for example, be uploaded to * hardware. * * Return: The Probe Request template. %NULL on error. */ struct sk_buff *ieee80211_probereq_get(struct ieee80211_hw *hw, const u8 *src_addr, const u8 *ssid, size_t ssid_len, size_t tailroom); /** * ieee80211_rts_get - RTS frame generation function * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @frame: pointer to the frame that is going to be protected by the RTS. * @frame_len: the frame length (in octets). * @frame_txctl: &struct ieee80211_tx_info of the frame. * @rts: The buffer where to store the RTS frame. * * If the RTS frames are generated by the host system (i.e., not in * hardware/firmware), the low-level driver uses this function to receive * the next RTS frame from the 802.11 code. The low-level driver is * responsible for calling this function before an RTS frame is needed. */ void ieee80211_rts_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif, const void *frame, size_t frame_len, const struct ieee80211_tx_info *frame_txctl, struct ieee80211_rts *rts); /** * ieee80211_rts_duration - Get the duration field for an RTS frame * @hw: pointer obtained from ieee80211_alloc_hw(). 
* @vif: &struct ieee80211_vif pointer from the add_interface callback.
 * @frame_len: the length of the frame that is going to be protected by the RTS.
 * @frame_txctl: &struct ieee80211_tx_info of the frame.
 *
 * If the RTS is generated in firmware, but the host system must provide
 * the duration field, the low-level driver uses this function to receive
 * the duration field value in little-endian byteorder.
 *
 * Return: The duration.
 */
__le16 ieee80211_rts_duration(struct ieee80211_hw *hw,
			      struct ieee80211_vif *vif, size_t frame_len,
			      const struct ieee80211_tx_info *frame_txctl);

/**
 * ieee80211_ctstoself_get - CTS-to-self frame generation function
 * @hw: pointer obtained from ieee80211_alloc_hw().
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 * @frame: pointer to the frame that is going to be protected by the CTS-to-self.
 * @frame_len: the frame length (in octets).
 * @frame_txctl: &struct ieee80211_tx_info of the frame.
 * @cts: The buffer where to store the CTS-to-self frame.
 *
 * If the CTS-to-self frames are generated by the host system (i.e., not in
 * hardware/firmware), the low-level driver uses this function to receive
 * the next CTS-to-self frame from the 802.11 code. The low-level driver is
 * responsible for calling this function before a CTS-to-self frame is
 * needed.
 */
void ieee80211_ctstoself_get(struct ieee80211_hw *hw,
			     struct ieee80211_vif *vif,
			     const void *frame, size_t frame_len,
			     const struct ieee80211_tx_info *frame_txctl,
			     struct ieee80211_cts *cts);

/**
 * ieee80211_ctstoself_duration - Get the duration field for a CTS-to-self frame
 * @hw: pointer obtained from ieee80211_alloc_hw().
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 * @frame_len: the length of the frame that is going to be protected by the CTS-to-self.
 * @frame_txctl: &struct ieee80211_tx_info of the frame.
*
 * If the CTS-to-self is generated in firmware, but the host system must provide
 * the duration field, the low-level driver uses this function to receive
 * the duration field value in little-endian byteorder.
 *
 * Return: The duration.
 */
__le16 ieee80211_ctstoself_duration(struct ieee80211_hw *hw,
				    struct ieee80211_vif *vif,
				    size_t frame_len,
				    const struct ieee80211_tx_info *frame_txctl);

/**
 * ieee80211_generic_frame_duration - Calculate the duration field for a frame
 * @hw: pointer obtained from ieee80211_alloc_hw().
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 * @band: the band to calculate the frame duration on
 * @frame_len: the length of the frame.
 * @rate: the rate at which the frame is going to be transmitted.
 *
 * Calculate the duration field of some generic frame, given its
 * length and transmission rate (in 100kbps).
 *
 * Return: The duration.
 */
__le16 ieee80211_generic_frame_duration(struct ieee80211_hw *hw,
					struct ieee80211_vif *vif,
					enum nl80211_band band,
					size_t frame_len,
					struct ieee80211_rate *rate);

/**
 * ieee80211_get_buffered_bc - accessing buffered broadcast and multicast frames
 * @hw: pointer as obtained from ieee80211_alloc_hw().
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 *
 * Function for accessing buffered broadcast and multicast frames. If
 * hardware/firmware does not implement buffering of broadcast/multicast
 * frames when power saving is used, 802.11 code buffers them in the host
 * memory. The low-level driver uses this function to fetch next buffered
 * frame. In most cases, this is used when generating beacon frame.
 *
 * Return: A pointer to the next buffered skb or NULL if no more buffered
 * frames are available.
 *
 * Note: buffered frames are returned only after DTIM beacon frame was
 * generated with ieee80211_beacon_get() and the low-level driver must thus
 * call ieee80211_beacon_get() first.
ieee80211_get_buffered_bc() returns * NULL if the previous generated beacon was not DTIM, so the low-level driver * does not need to check for DTIM beacons separately and should be able to * use common code for all beacons. */ struct sk_buff * ieee80211_get_buffered_bc(struct ieee80211_hw *hw, struct ieee80211_vif *vif); /** * ieee80211_get_tkip_p1k_iv - get a TKIP phase 1 key for IV32 * * This function returns the TKIP phase 1 key for the given IV32. * * @keyconf: the parameter passed with the set key * @iv32: IV32 to get the P1K for * @p1k: a buffer to which the key will be written, as 5 u16 values */ void ieee80211_get_tkip_p1k_iv(struct ieee80211_key_conf *keyconf, u32 iv32, u16 *p1k); /** * ieee80211_get_tkip_p1k - get a TKIP phase 1 key * * This function returns the TKIP phase 1 key for the IV32 taken * from the given packet. * * @keyconf: the parameter passed with the set key * @skb: the packet to take the IV32 value from that will be encrypted * with this P1K * @p1k: a buffer to which the key will be written, as 5 u16 values */ static inline void ieee80211_get_tkip_p1k(struct ieee80211_key_conf *keyconf, struct sk_buff *skb, u16 *p1k) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; const u8 *data = (u8 *)hdr + ieee80211_hdrlen(hdr->frame_control); u32 iv32 = get_unaligned_le32(&data[4]); ieee80211_get_tkip_p1k_iv(keyconf, iv32, p1k); } /** * ieee80211_get_tkip_rx_p1k - get a TKIP phase 1 key for RX * * This function returns the TKIP phase 1 key for the given IV32 * and transmitter address. * * @keyconf: the parameter passed with the set key * @ta: TA that will be used with the key * @iv32: IV32 to get the P1K for * @p1k: a buffer to which the key will be written, as 5 u16 values */ void ieee80211_get_tkip_rx_p1k(struct ieee80211_key_conf *keyconf, const u8 *ta, u32 iv32, u16 *p1k); /** * ieee80211_get_tkip_p2k - get a TKIP phase 2 key * * This function computes the TKIP RC4 key for the IV values * in the packet. 
*
 * @keyconf: the parameter passed with the set key
 * @skb: the packet to take the IV32/IV16 values from that will be
 *	encrypted with this key
 * @p2k: a buffer to which the key will be written, 16 bytes
 */
void ieee80211_get_tkip_p2k(struct ieee80211_key_conf *keyconf,
			    struct sk_buff *skb, u8 *p2k);

/**
 * ieee80211_tkip_add_iv - write TKIP IV and Ext. IV to pos
 *
 * @pos: start of crypto header
 * @keyconf: the parameter passed with the set key
 * @pn: PN to add
 *
 * Returns: pointer to the octet following IVs (i.e. beginning of
 * the packet payload)
 *
 * This function writes the TKIP IV value to @pos (which should
 * point to the crypto header)
 */
u8 *ieee80211_tkip_add_iv(u8 *pos, struct ieee80211_key_conf *keyconf, u64 pn);

/**
 * ieee80211_get_key_rx_seq - get key RX sequence counter
 *
 * @keyconf: the parameter passed with the set key
 * @tid: The TID, or -1 for the management frame value (CCMP/GCMP only);
 *	the value on TID 0 is also used for non-QoS frames. For
 *	CMAC, only TID 0 is valid.
 * @seq: buffer to receive the sequence data
 *
 * This function allows a driver to retrieve the current RX IV/PNs
 * for the given key. It must not be called if IV checking is done
 * by the device and not by mac80211.
 *
 * Note that this function may only be called when no RX processing
 * can be done concurrently.
 */
void ieee80211_get_key_rx_seq(struct ieee80211_key_conf *keyconf,
			      int tid, struct ieee80211_key_seq *seq);

/**
 * ieee80211_set_key_rx_seq - set key RX sequence counter
 *
 * @keyconf: the parameter passed with the set key
 * @tid: The TID, or -1 for the management frame value (CCMP/GCMP only);
 *	the value on TID 0 is also used for non-QoS frames. For
 *	CMAC, only TID 0 is valid.
 * @seq: new sequence data
 *
 * This function allows a driver to set the current RX IV/PNs for the
 * given key. This is useful when resuming from WoWLAN sleep and GTK
 * rekey may have been done while suspended. It should not be called
 * if IV checking is done by the device and not by mac80211.
*
 * Note that this function may only be called when no RX processing
 * can be done concurrently.
 */
void ieee80211_set_key_rx_seq(struct ieee80211_key_conf *keyconf,
			      int tid, struct ieee80211_key_seq *seq);

/**
 * ieee80211_remove_key - remove the given key
 * @keyconf: the parameter passed with the set key
 *
 * Context: Must be called with the wiphy mutex held.
 *
 * Remove the given key. If the key was uploaded to the hardware at the
 * time this function is called, it is not deleted in the hardware but
 * is instead assumed to have been removed already.
 */
void ieee80211_remove_key(struct ieee80211_key_conf *keyconf);

/**
 * ieee80211_gtk_rekey_add - add a GTK key from rekeying during WoWLAN
 * @vif: the virtual interface to add the key on
 * @keyconf: new key data
 * @link_id: the link id of the key or -1 for non-MLO
 *
 * When GTK rekeying was done while the system was suspended, (a) new
 * key(s) will be available. These will be needed by mac80211 for proper
 * RX processing, so this function allows setting them.
 *
 * The function returns the newly allocated key structure, which will
 * have similar contents to the passed key configuration but point to
 * mac80211-owned memory. In case of errors, the function returns an
 * ERR_PTR(), use IS_ERR() etc.
 *
 * Note that this function assumes the key isn't added to hardware
 * acceleration, so no TX will be done with the key. Since it's a GTK
 * on managed (station) networks, this is true anyway. If the driver
 * calls this function from the resume callback and subsequently uses
 * the return code 1 to reconfigure the device, this key will be part
 * of the reconfiguration.
 *
 * Note that the driver should also call ieee80211_set_key_rx_seq()
 * for the new key for each TID to set up sequence counters properly.
 *
 * IMPORTANT: If this replaces a key that is present in the hardware,
 * then it will attempt to remove it during this call.
In many cases
 * this isn't what you want, so call ieee80211_remove_key() first for
 * the key that's being replaced.
 */
struct ieee80211_key_conf *
ieee80211_gtk_rekey_add(struct ieee80211_vif *vif,
			struct ieee80211_key_conf *keyconf,
			int link_id);

/**
 * ieee80211_gtk_rekey_notify - notify userspace supplicant of rekeying
 * @vif: virtual interface the rekeying was done on
 * @bssid: The BSSID of the AP, for checking association
 * @replay_ctr: the new replay counter after GTK rekeying
 * @gfp: allocation flags
 */
void ieee80211_gtk_rekey_notify(struct ieee80211_vif *vif, const u8 *bssid,
				const u8 *replay_ctr, gfp_t gfp);

/**
 * ieee80211_key_mic_failure - increment MIC failure counter for the key
 *
 * Note: this is really only safe if no other RX function is called
 * at the same time.
 *
 * @keyconf: the key in question
 */
void ieee80211_key_mic_failure(struct ieee80211_key_conf *keyconf);

/**
 * ieee80211_key_replay - increment replay counter for the key
 *
 * Note: this is really only safe if no other RX function is called
 * at the same time.
 *
 * @keyconf: the key in question
 */
void ieee80211_key_replay(struct ieee80211_key_conf *keyconf);

/**
 * ieee80211_wake_queue - wake specific queue
 * @hw: pointer as obtained from ieee80211_alloc_hw().
 * @queue: queue number (counted from zero).
 *
 * Drivers must use this function instead of netif_wake_queue().
 */
void ieee80211_wake_queue(struct ieee80211_hw *hw, int queue);

/**
 * ieee80211_stop_queue - stop specific queue
 * @hw: pointer as obtained from ieee80211_alloc_hw().
 * @queue: queue number (counted from zero).
 *
 * Drivers must use this function instead of netif_stop_queue().
 */
void ieee80211_stop_queue(struct ieee80211_hw *hw, int queue);

/**
 * ieee80211_queue_stopped - test status of the queue
 * @hw: pointer as obtained from ieee80211_alloc_hw().
 * @queue: queue number (counted from zero).
 *
 * Drivers must use this function instead of netif_queue_stopped.
 *
 * Return: %true if the queue is stopped. %false otherwise.
*/
int ieee80211_queue_stopped(struct ieee80211_hw *hw, int queue);

/**
 * ieee80211_stop_queues - stop all queues
 * @hw: pointer as obtained from ieee80211_alloc_hw().
 *
 * Drivers must use this function instead of netif_tx_stop_all_queues().
 */
void ieee80211_stop_queues(struct ieee80211_hw *hw);

/**
 * ieee80211_wake_queues - wake all queues
 * @hw: pointer as obtained from ieee80211_alloc_hw().
 *
 * Drivers must use this function instead of netif_tx_wake_all_queues().
 */
void ieee80211_wake_queues(struct ieee80211_hw *hw);

/**
 * ieee80211_scan_completed - completed hardware scan
 *
 * When hardware scan offload is used (i.e. the hw_scan() callback is
 * assigned) this function needs to be called by the driver to notify
 * mac80211 that the scan finished. This function can be called from
 * any context, including hardirq context.
 *
 * @hw: the hardware that finished the scan
 * @info: information about the completed scan
 */
void ieee80211_scan_completed(struct ieee80211_hw *hw,
			      struct cfg80211_scan_info *info);

/**
 * ieee80211_sched_scan_results - got results from scheduled scan
 *
 * When a scheduled scan is running, this function needs to be called by the
 * driver whenever there are new scan results available.
 *
 * @hw: the hardware that is performing scheduled scans
 */
void ieee80211_sched_scan_results(struct ieee80211_hw *hw);

/**
 * ieee80211_sched_scan_stopped - inform that the scheduled scan has stopped
 *
 * When a scheduled scan is running, this function can be called by
 * the driver if it needs to stop the scan to perform another task.
 * Usual scenarios are drivers that cannot continue the scheduled scan
 * while associating, for instance.
*
 * @hw: the hardware that is performing scheduled scans
 */
void ieee80211_sched_scan_stopped(struct ieee80211_hw *hw);

/**
 * enum ieee80211_interface_iteration_flags - interface iteration flags
 * @IEEE80211_IFACE_ITER_NORMAL: Iterate over all interfaces that have
 *	been added to the driver; however, note that during hardware
 *	reconfiguration (after restart_hw) it will iterate over a new
 *	interface and over all the existing interfaces even if they
 *	haven't been re-added to the driver yet.
 * @IEEE80211_IFACE_ITER_RESUME_ALL: During resume, iterate over all
 *	interfaces, even if they haven't been re-added to the driver yet.
 * @IEEE80211_IFACE_ITER_ACTIVE: Iterate only active interfaces (netdev is up).
 * @IEEE80211_IFACE_SKIP_SDATA_NOT_IN_DRIVER: Skip any interfaces where SDATA
 *	is not in the driver.  This may fix crashes during firmware recovery
 *	for instance.
 */
enum ieee80211_interface_iteration_flags {
	IEEE80211_IFACE_ITER_NORMAL	= 0,
	IEEE80211_IFACE_ITER_RESUME_ALL	= BIT(0),
	IEEE80211_IFACE_ITER_ACTIVE	= BIT(1),
	IEEE80211_IFACE_SKIP_SDATA_NOT_IN_DRIVER	= BIT(2),
};

/**
 * ieee80211_iterate_interfaces - iterate interfaces
 *
 * This function iterates over the interfaces associated with a given
 * hardware and calls the callback for them. This includes active as well as
 * inactive interfaces. This function allows the iterator function to sleep.
 * Will iterate over a new interface during add_interface().
* * @hw: the hardware struct of which the interfaces should be iterated over * @iter_flags: iteration flags, see &enum ieee80211_interface_iteration_flags * @iterator: the iterator function to call * @data: first argument of the iterator function */ void ieee80211_iterate_interfaces(struct ieee80211_hw *hw, u32 iter_flags, void (*iterator)(void *data, u8 *mac, struct ieee80211_vif *vif), void *data); /** * ieee80211_iterate_active_interfaces - iterate active interfaces * * This function iterates over the interfaces associated with a given * hardware that are currently active and calls the callback for them. * This function allows the iterator function to sleep, when the iterator * function is atomic @ieee80211_iterate_active_interfaces_atomic can * be used. * Does not iterate over a new interface during add_interface(). * * @hw: the hardware struct of which the interfaces should be iterated over * @iter_flags: iteration flags, see &enum ieee80211_interface_iteration_flags * @iterator: the iterator function to call * @data: first argument of the iterator function */ static inline void ieee80211_iterate_active_interfaces(struct ieee80211_hw *hw, u32 iter_flags, void (*iterator)(void *data, u8 *mac, struct ieee80211_vif *vif), void *data) { ieee80211_iterate_interfaces(hw, iter_flags | IEEE80211_IFACE_ITER_ACTIVE, iterator, data); } /** * ieee80211_iterate_active_interfaces_atomic - iterate active interfaces * * This function iterates over the interfaces associated with a given * hardware that are currently active and calls the callback for them. * This function requires the iterator callback function to be atomic, * if that is not desired, use @ieee80211_iterate_active_interfaces instead. * Does not iterate over a new interface during add_interface(). 
* * @hw: the hardware struct of which the interfaces should be iterated over * @iter_flags: iteration flags, see &enum ieee80211_interface_iteration_flags * @iterator: the iterator function to call, cannot sleep * @data: first argument of the iterator function */ void ieee80211_iterate_active_interfaces_atomic(struct ieee80211_hw *hw, u32 iter_flags, void (*iterator)(void *data, u8 *mac, struct ieee80211_vif *vif), void *data); /** * ieee80211_iterate_active_interfaces_mtx - iterate active interfaces * * This function iterates over the interfaces associated with a given * hardware that are currently active and calls the callback for them. * This version can only be used while holding the wiphy mutex. * * @hw: the hardware struct of which the interfaces should be iterated over * @iter_flags: iteration flags, see &enum ieee80211_interface_iteration_flags * @iterator: the iterator function to call, cannot sleep * @data: first argument of the iterator function */ void ieee80211_iterate_active_interfaces_mtx(struct ieee80211_hw *hw, u32 iter_flags, void (*iterator)(void *data, u8 *mac, struct ieee80211_vif *vif), void *data); /** * ieee80211_iterate_stations_atomic - iterate stations * * This function iterates over all stations associated with a given * hardware that are currently uploaded to the driver and calls the callback * function for them. * This function requires the iterator callback function to be atomic, * * @hw: the hardware struct of which the interfaces should be iterated over * @iterator: the iterator function to call, cannot sleep * @data: first argument of the iterator function */ void ieee80211_iterate_stations_atomic(struct ieee80211_hw *hw, void (*iterator)(void *data, struct ieee80211_sta *sta), void *data); /** * ieee80211_queue_work - add work onto the mac80211 workqueue * * Drivers and mac80211 use this to add work onto the mac80211 workqueue. * This helper ensures drivers are not queueing work when they should not be. 
*
 * @hw: the hardware struct for the interface we are adding work for
 * @work: the work we want to add onto the mac80211 workqueue
 */
void ieee80211_queue_work(struct ieee80211_hw *hw, struct work_struct *work);

/**
 * ieee80211_queue_delayed_work - add work onto the mac80211 workqueue
 *
 * Drivers and mac80211 use this to queue delayed work onto the mac80211
 * workqueue.
 *
 * @hw: the hardware struct for the interface we are adding work for
 * @dwork: delayable work to queue onto the mac80211 workqueue
 * @delay: number of jiffies to wait before queueing
 */
void ieee80211_queue_delayed_work(struct ieee80211_hw *hw,
				  struct delayed_work *dwork,
				  unsigned long delay);

/**
 * ieee80211_refresh_tx_agg_session_timer - Refresh a tx agg session timer.
 * @sta: the station whose tx agg session timer is to be refreshed
 * @tid: the TID to BA on.
 *
 * This function allows the low level driver to refresh the tx agg session
 * timer to maintain the BA session; the session level will still be managed
 * by mac80211.
 *
 * Note: must be called in an RCU critical section.
 */
void ieee80211_refresh_tx_agg_session_timer(struct ieee80211_sta *sta,
					    u16 tid);

/**
 * ieee80211_start_tx_ba_session - Start a tx Block Ack session.
 * @sta: the station for which to start a BA session
 * @tid: the TID to BA on.
 * @timeout: session timeout value (in TUs)
 *
 * Return: success if addBA request was sent, failure otherwise
 *
 * Although mac80211/low level driver/user space application can estimate
 * the need to start aggregation on a certain RA/TID, the session level
 * will be managed by the mac80211.
 */
int ieee80211_start_tx_ba_session(struct ieee80211_sta *sta, u16 tid,
				  u16 timeout);

/**
 * ieee80211_start_tx_ba_cb_irqsafe - low level driver ready to aggregate.
 * @vif: &struct ieee80211_vif pointer from the add_interface callback
 * @ra: receiver address of the BA session recipient.
 * @tid: the TID to BA on.
 *
 * This function must be called by low level driver once it has
 * finished with preparations for the BA session.
It can be called
 * from any context.
 */
void ieee80211_start_tx_ba_cb_irqsafe(struct ieee80211_vif *vif, const u8 *ra,
				      u16 tid);

/**
 * ieee80211_stop_tx_ba_session - Stop a Block Ack session.
 * @sta: the station whose BA session to stop
 * @tid: the TID on which to stop BA.
 *
 * Return: negative error if the TID is invalid, or no aggregation active
 *
 * Although mac80211/low level driver/user space application can estimate
 * the need to stop aggregation on a certain RA/TID, the session level
 * will be managed by the mac80211.
 */
int ieee80211_stop_tx_ba_session(struct ieee80211_sta *sta, u16 tid);

/**
 * ieee80211_stop_tx_ba_cb_irqsafe - low level driver ready to stop aggregate.
 * @vif: &struct ieee80211_vif pointer from the add_interface callback
 * @ra: receiver address of the BA session recipient.
 * @tid: the desired TID to BA on.
 *
 * This function must be called by low level driver once it has
 * finished with preparations for the BA session tear down. It
 * can be called from any context.
 */
void ieee80211_stop_tx_ba_cb_irqsafe(struct ieee80211_vif *vif, const u8 *ra,
				     u16 tid);

/**
 * ieee80211_find_sta - find a station
 *
 * @vif: virtual interface to look for station on
 * @addr: station's address
 *
 * Return: The station, if found. %NULL otherwise.
 *
 * Note: This function must be called under RCU lock and the
 * resulting pointer is only valid under RCU lock as well.
 */
struct ieee80211_sta *ieee80211_find_sta(struct ieee80211_vif *vif,
					 const u8 *addr);

/**
 * ieee80211_find_sta_by_ifaddr - find a station on hardware
 *
 * @hw: pointer as obtained from ieee80211_alloc_hw()
 * @addr: remote station's address
 * @localaddr: local address (vif->sdata->vif.addr). Use NULL for 'any'.
 *
 * Return: The station, if found. %NULL otherwise.
 *
 * Note: This function must be called under RCU lock and the
 * resulting pointer is only valid under RCU lock as well.
 *
 * NOTE: You may pass NULL for localaddr, but then you will just get
 *      the first STA that matches the remote address 'addr'.
*      We can have multiple STA associated with multiple
 *      logical stations (e.g. consider a station connecting to another
 *      BSSID on the same AP hardware without disconnecting first).
 *      In this case, the result of this method with localaddr NULL
 *      is not reliable.
 *
 * DO NOT USE THIS FUNCTION with localaddr NULL if at all possible.
 */
struct ieee80211_sta *ieee80211_find_sta_by_ifaddr(struct ieee80211_hw *hw,
					       const u8 *addr,
					       const u8 *localaddr);

/**
 * ieee80211_find_sta_by_link_addrs - find STA by link addresses
 * @hw: pointer as obtained from ieee80211_alloc_hw()
 * @addr: remote station's link address
 * @localaddr: local link address, use %NULL for any (but avoid that)
 * @link_id: pointer to obtain the link ID if the STA is found,
 *	may be %NULL if the link ID is not needed
 *
 * Obtain the STA by link address, must use RCU protection.
 */
struct ieee80211_sta *
ieee80211_find_sta_by_link_addrs(struct ieee80211_hw *hw,
				 const u8 *addr,
				 const u8 *localaddr,
				 unsigned int *link_id);

/**
 * ieee80211_sta_block_awake - block station from waking up
 * @hw: the hardware
 * @pubsta: the station
 * @block: whether to block or unblock
 *
 * Some devices require that all frames that are on the queues
 * for a specific station that went to sleep are flushed before
 * a poll response or frames after the station woke up can be
 * delivered to it. Note that such frames must be rejected
 * by the driver as filtered, with the appropriate status flag.
 *
 * This function allows implementing this mode in a race-free
 * manner.
 *
 * To do this, a driver must keep track of the number of frames
 * still enqueued for a specific station. If this number is not
 * zero when the station goes to sleep, the driver must call
 * this function to force mac80211 to consider the station to
 * be asleep regardless of the station's actual state. Once the
 * number of outstanding frames reaches zero, the driver must
 * call this function again to unblock the station. That will
 * cause mac80211 to be able to send ps-poll responses, and if
 * the station queried in the meantime then frames will also
 * be sent out as a result of this. Additionally, the driver
 * will be notified that the station woke up some time after
 * it is unblocked, regardless of whether the station actually
 * woke up while blocked or not.
 */
void ieee80211_sta_block_awake(struct ieee80211_hw *hw,
			       struct ieee80211_sta *pubsta, bool block);

/**
 * ieee80211_sta_eosp - notify mac80211 about end of SP
 * @pubsta: the station
 *
 * When a device transmits frames in a way that it can't tell
 * mac80211 in the TX status about the EOSP, it must clear the
 * %IEEE80211_TX_STATUS_EOSP bit and call this function instead.
 * This applies for PS-Poll as well as uAPSD.
 *
 * Note that just like with _tx_status() and _rx() drivers must
 * not mix calls to irqsafe/non-irqsafe versions, this function
 * must not be mixed with those either. Use the all irqsafe, or
 * all non-irqsafe, don't mix!
 *
 * NB: the _irqsafe version of this function doesn't exist, no
 *     driver needs it right now. Don't call this function if
 *     you'd need the _irqsafe version, look at the git history
 *     and restore the _irqsafe version!
 */
void ieee80211_sta_eosp(struct ieee80211_sta *pubsta);

/**
 * ieee80211_send_eosp_nullfunc - ask mac80211 to send NDP with EOSP
 * @pubsta: the station
 * @tid: the tid of the NDP
 *
 * Sometimes the device understands that it needs to close
 * the Service Period unexpectedly. This can happen when
 * sending frames that are filling holes in the BA window.
 * In this case, the device can ask mac80211 to send a
 * Nullfunc frame with EOSP set. When that happens, the
 * driver must have called ieee80211_sta_set_buffered() to
 * let mac80211 know that there are no buffered frames any
 * more, otherwise mac80211 will get the more_data bit wrong.
 * The low level driver must have made sure that the frame
 * will be sent despite the station being in power-save.
* Mac80211 won't call allow_buffered_frames().
 * Note that calling this function, doesn't exempt the driver
 * from closing the EOSP properly, it will still have to call
 * ieee80211_sta_eosp when the NDP is sent.
 */
void ieee80211_send_eosp_nullfunc(struct ieee80211_sta *pubsta, int tid);

/**
 * ieee80211_sta_recalc_aggregates - recalculate aggregate data after a change
 * @pubsta: the station
 *
 * Call this function after changing per-link aggregate data as referenced in
 * &struct ieee80211_sta_aggregates by accessing the agg field of
 * &struct ieee80211_link_sta.
 *
 * With non-MLO, the data in deflink will be referenced directly. In that case
 * there is no need to call this function.
 */
void ieee80211_sta_recalc_aggregates(struct ieee80211_sta *pubsta);

/**
 * ieee80211_sta_register_airtime - register airtime usage for a sta/tid
 *
 * Register airtime usage for a given sta on a given tid. The driver must call
 * this function to notify mac80211 that a station used a certain amount of
 * airtime. This information will be used by the TXQ scheduler to schedule
 * stations in a way that ensures airtime fairness.
 *
 * The reported airtime should as a minimum include all time that is spent
 * transmitting to the remote station, including overhead and padding, but not
 * including time spent waiting for a TXOP. If the time is not reported by the
 * hardware it can in some cases be calculated from the rate and known frame
 * composition. When possible, the time should include any failed transmission
 * attempts.
 *
 * The driver can either call this function synchronously for every packet or
 * aggregate, or asynchronously as airtime usage information becomes available.
 * TX and RX airtime can be reported together, or separately by setting one of
 * them to 0.
*
 * @pubsta: the station
 * @tid: the TID to register airtime for
 * @tx_airtime: airtime used during TX (in usec)
 * @rx_airtime: airtime used during RX (in usec)
 */
void ieee80211_sta_register_airtime(struct ieee80211_sta *pubsta, u8 tid,
				    u32 tx_airtime, u32 rx_airtime);

/**
 * ieee80211_txq_airtime_check - check if a txq can send frame to device
 *
 * @hw: pointer obtained from ieee80211_alloc_hw()
 * @txq: pointer obtained from station or virtual interface
 *
 * Return: %true if the AQL's airtime limit has not been reached and the txq
 * can continue to send more packets to the device. Otherwise %false.
 */
bool
ieee80211_txq_airtime_check(struct ieee80211_hw *hw, struct ieee80211_txq *txq);

/**
 * ieee80211_iter_keys - iterate keys programmed into the device
 * @hw: pointer obtained from ieee80211_alloc_hw()
 * @vif: virtual interface to iterate, may be %NULL for all
 * @iter: iterator function that will be called for each key
 * @iter_data: custom data to pass to the iterator function
 *
 * Context: Must be called with wiphy mutex held; can sleep.
 *
 * This function can be used to iterate all the keys known to
 * mac80211, even those that weren't previously programmed into
 * the device. This is intended for use in WoWLAN if the device
 * needs reprogramming of the keys during suspend.
 *
 * The order in which the keys are iterated matches the order
 * in which they were originally installed and handed to the
 * set_key callback.
*/
void ieee80211_iter_keys(struct ieee80211_hw *hw,
			 struct ieee80211_vif *vif,
			 void (*iter)(struct ieee80211_hw *hw,
				      struct ieee80211_vif *vif,
				      struct ieee80211_sta *sta,
				      struct ieee80211_key_conf *key,
				      void *data),
			 void *iter_data);

/**
 * ieee80211_iter_keys_rcu - iterate keys programmed into the device
 * @hw: pointer obtained from ieee80211_alloc_hw()
 * @vif: virtual interface to iterate, may be %NULL for all
 * @iter: iterator function that will be called for each key
 * @iter_data: custom data to pass to the iterator function
 *
 * This function can be used to iterate all the keys known to
 * mac80211, even those that weren't previously programmed into
 * the device. Note that due to locking reasons, keys of stations
 * in the removal process will be skipped.
 *
 * This function requires being called in an RCU critical section,
 * and thus @iter must be atomic.
 */
void ieee80211_iter_keys_rcu(struct ieee80211_hw *hw,
			     struct ieee80211_vif *vif,
			     void (*iter)(struct ieee80211_hw *hw,
					  struct ieee80211_vif *vif,
					  struct ieee80211_sta *sta,
					  struct ieee80211_key_conf *key,
					  void *data),
			     void *iter_data);

/**
 * ieee80211_iter_chan_contexts_atomic - iterate channel contexts
 * @hw: pointer obtained from ieee80211_alloc_hw().
 * @iter: iterator function
 * @iter_data: data passed to iterator function
 *
 * Iterate all active channel contexts. This function is atomic and
 * doesn't acquire any locks internally that might be held in other
 * places while calling into the driver.
 *
 * The iterator will not find a context that's being added (during
 * the driver callback to add it) but will find it while it's being
 * removed.
 *
 * Note that during hardware restart, all contexts that existed
 * before the restart are considered already present so will be
 * found while iterating, whether they've been re-added already
 * or not.
*/
void ieee80211_iter_chan_contexts_atomic(
	struct ieee80211_hw *hw,
	void (*iter)(struct ieee80211_hw *hw,
		     struct ieee80211_chanctx_conf *chanctx_conf,
		     void *data),
	void *iter_data);

/**
 * ieee80211_ap_probereq_get - retrieve a Probe Request template
 * @hw: pointer obtained from ieee80211_alloc_hw().
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 *
 * Creates a Probe Request template which can, for example, be uploaded to
 * hardware. The template is filled with bssid, ssid and supported rate
 * information. This function must only be called from within the
 * .bss_info_changed callback function and only in managed mode. The function
 * is only useful when the interface is associated, otherwise it will return
 * %NULL.
 *
 * Return: The Probe Request template. %NULL on error.
 */
struct sk_buff *ieee80211_ap_probereq_get(struct ieee80211_hw *hw,
					  struct ieee80211_vif *vif);

/**
 * ieee80211_beacon_loss - inform that the hardware does not receive beacons
 *
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 *
 * When beacon filtering is enabled with %IEEE80211_VIF_BEACON_FILTER and
 * %IEEE80211_CONF_PS is set, the driver needs to inform whenever the
 * hardware is not receiving beacons with this function.
 */
void ieee80211_beacon_loss(struct ieee80211_vif *vif);

/**
 * ieee80211_connection_loss - inform hardware has lost connection to the AP
 *
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 *
 * When beacon filtering is enabled with %IEEE80211_VIF_BEACON_FILTER, and
 * %IEEE80211_CONF_PS and %IEEE80211_HW_CONNECTION_MONITOR are set, the driver
 * needs to inform if the connection to the AP has been lost.
 * The function may also be called if the connection needs to be terminated
 * for some other reason, even if %IEEE80211_HW_CONNECTION_MONITOR isn't set.
 *
 * This function will cause immediate change to disassociated state,
 * without connection recovery attempts.
*/ void ieee80211_connection_loss(struct ieee80211_vif *vif); /** * ieee80211_disconnect - request disconnection * * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @reconnect: immediate reconnect is desired * * Request disconnection from the current network and, if enabled, send a * hint to the higher layers that immediate reconnect is desired. */ void ieee80211_disconnect(struct ieee80211_vif *vif, bool reconnect); /** * ieee80211_resume_disconnect - disconnect from AP after resume * * @vif: &struct ieee80211_vif pointer from the add_interface callback. * * Instructs mac80211 to disconnect from the AP after resume. * Drivers can use this after WoWLAN if they know that the * connection cannot be kept up, for example because keys were * used while the device was asleep but the replay counters or * similar cannot be retrieved from the device during resume. * * Note that due to implementation issues, if the driver uses * the reconfiguration functionality during resume the interface * will still be added as associated first during resume and then * disconnect normally later. * * This function can only be called from the resume callback and * the driver must not be holding any of its own locks while it * calls this function, or at least not any locks it needs in the * key configuration paths (if it supports HW crypto). */ void ieee80211_resume_disconnect(struct ieee80211_vif *vif); /** * ieee80211_hw_restart_disconnect - disconnect from AP after * hardware restart * @vif: &struct ieee80211_vif pointer from the add_interface callback. * * Instructs mac80211 to disconnect from the AP after * hardware restart. */ void ieee80211_hw_restart_disconnect(struct ieee80211_vif *vif); /** * ieee80211_cqm_rssi_notify - inform a configured connection quality monitoring * rssi threshold triggered * * @vif: &struct ieee80211_vif pointer from the add_interface callback. 
* @rssi_event: the RSSI trigger event type * @rssi_level: new RSSI level value or 0 if not available * @gfp: context flags * * When the %IEEE80211_VIF_SUPPORTS_CQM_RSSI is set, and a connection quality * monitoring is configured with an rssi threshold, the driver will inform * whenever the rssi level reaches the threshold. */ void ieee80211_cqm_rssi_notify(struct ieee80211_vif *vif, enum nl80211_cqm_rssi_threshold_event rssi_event, s32 rssi_level, gfp_t gfp); /** * ieee80211_cqm_beacon_loss_notify - inform CQM of beacon loss * * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @gfp: context flags */ void ieee80211_cqm_beacon_loss_notify(struct ieee80211_vif *vif, gfp_t gfp); /** * ieee80211_radar_detected - inform that a radar was detected * * @hw: pointer as obtained from ieee80211_alloc_hw() */ void ieee80211_radar_detected(struct ieee80211_hw *hw); /** * ieee80211_chswitch_done - Complete channel switch process * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @success: make the channel switch successful or not * @link_id: the link_id on which the switch was done. Ignored if success is * false. * * Complete the channel switch post-process: set the new operational channel * and wake up the suspended queues. */ void ieee80211_chswitch_done(struct ieee80211_vif *vif, bool success, unsigned int link_id); /** * ieee80211_channel_switch_disconnect - disconnect due to channel switch error * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @block_tx: if %true, do not send deauth frame. * * Instruct mac80211 to disconnect due to a channel switch error. The channel * switch can request to block the tx and so, we need to make sure we do not send * a deauth frame in this case. */ void ieee80211_channel_switch_disconnect(struct ieee80211_vif *vif, bool block_tx); /** * ieee80211_request_smps - request SM PS transition * @vif: &struct ieee80211_vif pointer from the add_interface callback. 
* @link_id: link ID for MLO, or 0
 * @smps_mode: new SM PS mode
 *
 * This allows the driver to request an SM PS transition in managed
 * mode. This is useful when the driver has more information than
 * the stack about possible interference, for example by bluetooth.
 */
void ieee80211_request_smps(struct ieee80211_vif *vif, unsigned int link_id,
			    enum ieee80211_smps_mode smps_mode);

/**
 * ieee80211_ready_on_channel - notification of remain-on-channel start
 * @hw: pointer as obtained from ieee80211_alloc_hw()
 */
void ieee80211_ready_on_channel(struct ieee80211_hw *hw);

/**
 * ieee80211_remain_on_channel_expired - remain_on_channel duration expired
 * @hw: pointer as obtained from ieee80211_alloc_hw()
 */
void ieee80211_remain_on_channel_expired(struct ieee80211_hw *hw);

/**
 * ieee80211_stop_rx_ba_session - callback to stop existing BA sessions
 *
 * In order not to harm the system performance and user experience, the device
 * may request not to allow any rx ba session and tear down existing rx ba
 * sessions based on system constraints such as periodic BT activity that needs
 * to limit wlan activity (e.g. SCO or A2DP).
 * In such cases, the intention is to limit the duration of the rx ppdu and
 * therefore prevent the peer device from using a-mpdu aggregation.
 *
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 * @ba_rx_bitmap: Bit map of open rx ba per tid
 * @addr: pointer to the BSSID MAC address
 */
void ieee80211_stop_rx_ba_session(struct ieee80211_vif *vif, u16 ba_rx_bitmap,
				  const u8 *addr);

/**
 * ieee80211_mark_rx_ba_filtered_frames - move RX BA window and mark filtered
 * @pubsta: station struct
 * @tid: the session's TID
 * @ssn: starting sequence number of the bitmap, all frames before this are
 *	assumed to be out of the window after the call
 * @filtered: bitmap of filtered frames, BIT(0) is the @ssn entry etc.
* @received_mpdus: number of received mpdus in firmware * * This function moves the BA window and releases all frames before @ssn, and * marks frames marked in the bitmap as having been filtered. Afterwards, it * checks if any frames in the window starting from @ssn can now be released * (in case they were only waiting for frames that were filtered.) * (Only work correctly if @max_rx_aggregation_subframes <= 64 frames) */ void ieee80211_mark_rx_ba_filtered_frames(struct ieee80211_sta *pubsta, u8 tid, u16 ssn, u64 filtered, u16 received_mpdus); /** * ieee80211_send_bar - send a BlockAckReq frame * * can be used to flush pending frames from the peer's aggregation reorder * buffer. * * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @ra: the peer's destination address * @tid: the TID of the aggregation session * @ssn: the new starting sequence number for the receiver */ void ieee80211_send_bar(struct ieee80211_vif *vif, u8 *ra, u16 tid, u16 ssn); /** * ieee80211_manage_rx_ba_offl - helper to queue an RX BA work * @vif: &struct ieee80211_vif pointer from the add_interface callback * @addr: station mac address * @tid: the rx tid */ void ieee80211_manage_rx_ba_offl(struct ieee80211_vif *vif, const u8 *addr, unsigned int tid); /** * ieee80211_start_rx_ba_session_offl - start a Rx BA session * * Some device drivers may offload part of the Rx aggregation flow including * AddBa/DelBa negotiation but may otherwise be incapable of full Rx * reordering. * * Create structures responsible for reordering so device drivers may call here * when they complete AddBa negotiation. 
*
 * @vif: &struct ieee80211_vif pointer from the add_interface callback
 * @addr: station mac address
 * @tid: the rx tid
 */
static inline void ieee80211_start_rx_ba_session_offl(struct ieee80211_vif *vif,
						      const u8 *addr, u16 tid)
{
	/* Warn and do nothing for a TID outside [0, IEEE80211_NUM_TIDS). */
	if (WARN_ON(tid >= IEEE80211_NUM_TIDS))
		return;
	ieee80211_manage_rx_ba_offl(vif, addr, tid);
}

/**
 * ieee80211_stop_rx_ba_session_offl - stop a Rx BA session
 *
 * Some device drivers may offload part of the Rx aggregation flow including
 * AddBa/DelBa negotiation but may otherwise be incapable of full Rx
 * reordering.
 *
 * Destroy structures responsible for reordering so device drivers may call here
 * when they complete DelBa negotiation.
 *
 * @vif: &struct ieee80211_vif pointer from the add_interface callback
 * @addr: station mac address
 * @tid: the rx tid
 */
static inline void ieee80211_stop_rx_ba_session_offl(struct ieee80211_vif *vif,
						     const u8 *addr, u16 tid)
{
	/* Warn and do nothing for a TID outside [0, IEEE80211_NUM_TIDS). */
	if (WARN_ON(tid >= IEEE80211_NUM_TIDS))
		return;
	/*
	 * The stop request is passed as the TID offset by IEEE80211_NUM_TIDS,
	 * whereas the start helper passes the plain TID.
	 * NOTE(review): presumably this offset is how
	 * ieee80211_manage_rx_ba_offl() tells stop from start - confirm
	 * against its implementation.
	 */
	ieee80211_manage_rx_ba_offl(vif, addr, tid + IEEE80211_NUM_TIDS);
}

/**
 * ieee80211_rx_ba_timer_expired - stop a Rx BA session due to timeout
 *
 * Some device drivers do not offload AddBa/DelBa negotiation, but handle rx
 * buffer reordering internally, and therefore also handle the session timer.
 *
 * Trigger the timeout flow, which sends a DelBa.
 *
 * @vif: &struct ieee80211_vif pointer from the add_interface callback
 * @addr: station mac address
 * @tid: the rx tid
 */
void ieee80211_rx_ba_timer_expired(struct ieee80211_vif *vif,
				   const u8 *addr, unsigned int tid);

/* Rate control API */

/**
 * struct ieee80211_tx_rate_control - rate control information for/from RC algo
 *
 * @hw: The hardware the algorithm is invoked for.
 * @sband: The band this frame is being transmitted on.
* @bss_conf: the current BSS configuration
 * @skb: the skb that will be transmitted, the control information in it needs
 *	to be filled in
 * @reported_rate: The rate control algorithm can fill this in to indicate
 *	which rate should be reported to userspace as the current rate and
 *	used for rate calculations in the mesh network.
 * @rts: whether RTS will be used for this frame because it is longer than the
 *	RTS threshold
 * @short_preamble: whether mac80211 will request short-preamble transmission
 *	if the selected rate supports it
 * @rate_idx_mask: user-requested (legacy) rate mask
 * @rate_idx_mcs_mask: user-requested MCS rate mask (NULL if not in use)
 * @bss: whether this frame is sent out in AP or IBSS mode
 */
struct ieee80211_tx_rate_control {
	struct ieee80211_hw *hw;
	struct ieee80211_supported_band *sband;
	struct ieee80211_bss_conf *bss_conf;
	struct sk_buff *skb;
	struct ieee80211_tx_rate reported_rate;
	bool rts, short_preamble;
	u32 rate_idx_mask;
	u8 *rate_idx_mcs_mask;
	bool bss;
};

/**
 * enum rate_control_capabilities - rate control capabilities
 */
enum rate_control_capabilities {
	/**
	 * @RATE_CTRL_CAPA_VHT_EXT_NSS_BW:
	 * Support for extended NSS BW support (dot11VHTExtendedNSSCapable)
	 * Note that this is only looked at if the minimum number of chains
	 * that the AP uses is < the number of TX chains the hardware has,
	 * otherwise the NSS difference doesn't bother us.
	 */
	RATE_CTRL_CAPA_VHT_EXT_NSS_BW = BIT(0),
	/**
	 * @RATE_CTRL_CAPA_AMPDU_TRIGGER:
	 * mac80211 should start A-MPDU sessions on tx
	 */
	RATE_CTRL_CAPA_AMPDU_TRIGGER = BIT(1),
};

/*
 * Callback table describing a rate control algorithm; this is the type
 * accepted by ieee80211_rate_control_register() and
 * ieee80211_rate_control_unregister().
 *
 * NOTE(review): @capa is presumably a bitmask of
 * enum rate_control_capabilities values - confirm against the users.
 */
struct rate_control_ops {
	unsigned long capa;
	const char *name;
	void *(*alloc)(struct ieee80211_hw *hw);
	void (*add_debugfs)(struct ieee80211_hw *hw, void *priv,
			    struct dentry *debugfsdir);
	void (*free)(void *priv);

	void *(*alloc_sta)(void *priv, struct ieee80211_sta *sta, gfp_t gfp);
	void (*rate_init)(void *priv, struct ieee80211_supported_band *sband,
			  struct cfg80211_chan_def *chandef,
			  struct ieee80211_sta *sta, void *priv_sta);
	void (*rate_update)(void *priv, struct ieee80211_supported_band *sband,
			    struct cfg80211_chan_def *chandef,
			    struct ieee80211_sta *sta, void *priv_sta,
			    u32 changed);
	void (*free_sta)(void *priv, struct ieee80211_sta *sta,
			 void *priv_sta);

	void (*tx_status_ext)(void *priv,
			      struct ieee80211_supported_band *sband,
			      void *priv_sta, struct ieee80211_tx_status *st);
	void (*tx_status)(void *priv, struct ieee80211_supported_band *sband,
			  struct ieee80211_sta *sta, void *priv_sta,
			  struct sk_buff *skb);
	void (*get_rate)(void *priv, struct ieee80211_sta *sta, void *priv_sta,
			 struct ieee80211_tx_rate_control *txrc);

	void (*add_sta_debugfs)(void *priv, void *priv_sta,
				struct dentry *dir);

	u32 (*get_expected_throughput)(void *priv_sta);
};

/*
 * Return nonzero if @sta is NULL, or if bit @index is set in
 * sta->deflink.supp_rates[@band]; zero otherwise.
 */
static inline int rate_supported(struct ieee80211_sta *sta,
				 enum nl80211_band band,
				 int index)
{
	return (sta == NULL || sta->deflink.supp_rates[band] & BIT(index));
}

/*
 * Return the first index in [0, sband->n_bitrates) for which
 * rate_supported() is nonzero. If there is none, warn once and return 0.
 */
static inline s8
rate_lowest_index(struct ieee80211_supported_band *sband,
		  struct ieee80211_sta *sta)
{
	int i;

	for (i = 0; i < sband->n_bitrates; i++)
		if (rate_supported(sta, sband->band, i))
			return i;

	/* warn when we cannot find a rate. */
	WARN_ON_ONCE(1);

	/* and return 0 (the lowest index) */
	return 0;
}

/*
 * Return true if rate_supported() is nonzero for at least one index in
 * [0, sband->n_bitrates). Unlike rate_lowest_index() this never warns.
 */
static inline
bool rate_usable_index_exists(struct ieee80211_supported_band *sband,
			      struct ieee80211_sta *sta)
{
	unsigned int i;

	for (i = 0; i < sband->n_bitrates; i++)
		if (rate_supported(sta, sband->band, i))
			return true;
	return false;
}

/**
 * rate_control_set_rates - pass the sta rate selection to mac80211/driver
 *
 * When not doing a rate control probe to test rates, rate control should pass
 * its rate selection to mac80211. If the driver supports receiving a station
 * rate table, it will use it to ensure that frames are always sent based on
 * the most recent rate control module decision.
 *
 * @hw: pointer as obtained from ieee80211_alloc_hw()
 * @pubsta: &struct ieee80211_sta pointer to the target destination.
 * @rates: new tx rate set to be used for this station.
 */
int rate_control_set_rates(struct ieee80211_hw *hw,
			   struct ieee80211_sta *pubsta,
			   struct ieee80211_sta_rates *rates);

int ieee80211_rate_control_register(const struct rate_control_ops *ops);
void ieee80211_rate_control_unregister(const struct rate_control_ops *ops);

/* True if the configured channel width is NL80211_CHAN_WIDTH_20. */
static inline bool
conf_is_ht20(struct ieee80211_conf *conf)
{
	return conf->chandef.width == NL80211_CHAN_WIDTH_20;
}

/*
 * True if the width is NL80211_CHAN_WIDTH_40 and center_freq1 lies below
 * the control channel's center frequency.
 */
static inline bool
conf_is_ht40_minus(struct ieee80211_conf *conf)
{
	return conf->chandef.width == NL80211_CHAN_WIDTH_40 &&
	       conf->chandef.center_freq1 < conf->chandef.chan->center_freq;
}

/*
 * True if the width is NL80211_CHAN_WIDTH_40 and center_freq1 lies above
 * the control channel's center frequency.
 */
static inline bool
conf_is_ht40_plus(struct ieee80211_conf *conf)
{
	return conf->chandef.width == NL80211_CHAN_WIDTH_40 &&
	       conf->chandef.center_freq1 > conf->chandef.chan->center_freq;
}

/* True if the configured channel width is NL80211_CHAN_WIDTH_40. */
static inline bool
conf_is_ht40(struct ieee80211_conf *conf)
{
	return conf->chandef.width == NL80211_CHAN_WIDTH_40;
}

/*
 * True for every channel width except NL80211_CHAN_WIDTH_5,
 * NL80211_CHAN_WIDTH_10 and NL80211_CHAN_WIDTH_20_NOHT.
 */
static inline bool
conf_is_ht(struct ieee80211_conf *conf)
{
	return (conf->chandef.width != NL80211_CHAN_WIDTH_5) &&
		(conf->chandef.width != NL80211_CHAN_WIDTH_10) &&
		(conf->chandef.width != NL80211_CHAN_WIDTH_20_NOHT);
}

/*
 * Map an interface type to its P2P variant: when @p2p is set,
 * NL80211_IFTYPE_STATION becomes NL80211_IFTYPE_P2P_CLIENT and
 * NL80211_IFTYPE_AP becomes NL80211_IFTYPE_P2P_GO. Every other
 * combination returns @type unchanged.
 */
static inline enum nl80211_iftype
ieee80211_iftype_p2p(enum nl80211_iftype type, bool p2p)
{
	if (p2p) {
		switch (type) {
		case NL80211_IFTYPE_STATION:
			return NL80211_IFTYPE_P2P_CLIENT;
		case NL80211_IFTYPE_AP:
			return NL80211_IFTYPE_P2P_GO;
		default:
			break;
		}
	}
	return type;
}

/* Apply ieee80211_iftype_p2p() to @vif's own type and p2p flag. */
static inline enum nl80211_iftype
ieee80211_vif_type_p2p(struct ieee80211_vif *vif)
{
	return ieee80211_iftype_p2p(vif->type, vif->p2p);
}

/**
 * ieee80211_get_he_iftype_cap_vif - return HE capabilities for sband/vif
 * @sband: the sband to search for the iftype on
 * @vif: the vif to get the iftype from
 *
 * Return: pointer to the struct ieee80211_sta_he_cap, or %NULL if none found
 */
static inline const struct ieee80211_sta_he_cap *
ieee80211_get_he_iftype_cap_vif(const struct ieee80211_supported_band *sband,
				struct ieee80211_vif *vif)
{
	return ieee80211_get_he_iftype_cap(sband, ieee80211_vif_type_p2p(vif));
}

/**
 * ieee80211_get_he_6ghz_capa_vif - return HE 6 GHz capabilities
 * @sband: the sband to search for the STA on
 * @vif: the vif to get the iftype from
 *
 * Return: the 6GHz capabilities
 */
static inline __le16
ieee80211_get_he_6ghz_capa_vif(const struct ieee80211_supported_band *sband,
			       struct ieee80211_vif *vif)
{
	return ieee80211_get_he_6ghz_capa(sband, ieee80211_vif_type_p2p(vif));
}

/**
 * ieee80211_get_eht_iftype_cap_vif - return EHT capabilities for sband/vif
 * @sband: the sband to search for the iftype on
 * @vif: the vif to get the iftype from
 *
 * Return: pointer to the struct ieee80211_sta_eht_cap, or %NULL if none found
 */
static inline const struct ieee80211_sta_eht_cap *
ieee80211_get_eht_iftype_cap_vif(const struct ieee80211_supported_band *sband,
				 struct ieee80211_vif *vif)
{
	return ieee80211_get_eht_iftype_cap(sband, ieee80211_vif_type_p2p(vif));
}

/**
 * ieee80211_update_mu_groups - set the VHT MU-MIMO group data
 *
 * @vif: the specified virtual interface
 * @link_id: the link ID for MLO, otherwise 0
 * @membership: 64 bits array - a bit is set if station is member of the group
 * @position: 2 bits per group id indicating
the position in the group * * Note: This function assumes that the given vif is valid and the position and * membership data is of the correct size and are in the same byte order as the * matching GroupId management frame. * Calls to this function need to be serialized with RX path. */ void ieee80211_update_mu_groups(struct ieee80211_vif *vif, unsigned int link_id, const u8 *membership, const u8 *position); void ieee80211_enable_rssi_reports(struct ieee80211_vif *vif, int rssi_min_thold, int rssi_max_thold); void ieee80211_disable_rssi_reports(struct ieee80211_vif *vif); /** * ieee80211_ave_rssi - report the average RSSI for the specified interface * * @vif: the specified virtual interface * * Note: This function assumes that the given vif is valid. * * Return: The average RSSI value for the requested interface, or 0 if not * applicable. */ int ieee80211_ave_rssi(struct ieee80211_vif *vif); /** * ieee80211_report_wowlan_wakeup - report WoWLAN wakeup * @vif: virtual interface * @wakeup: wakeup reason(s) * @gfp: allocation flags * * See cfg80211_report_wowlan_wakeup(). */ void ieee80211_report_wowlan_wakeup(struct ieee80211_vif *vif, struct cfg80211_wowlan_wakeup *wakeup, gfp_t gfp); /** * ieee80211_tx_prepare_skb - prepare an 802.11 skb for transmission * @hw: pointer as obtained from ieee80211_alloc_hw() * @vif: virtual interface * @skb: frame to be sent from within the driver * @band: the band to transmit on * @sta: optional pointer to get the station to send the frame to * * Note: must be called under RCU lock */ bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct sk_buff *skb, int band, struct ieee80211_sta **sta); /** * ieee80211_parse_tx_radiotap - Sanity-check and parse the radiotap header * of injected frames. * * To accurately parse and take into account rate and retransmission fields, * you must initialize the chandef field in the ieee80211_tx_info structure * of the skb before calling this function. 
*
 * @skb: packet injected by userspace
 * @dev: the &struct device of this 802.11 device
 */
bool ieee80211_parse_tx_radiotap(struct sk_buff *skb,
				 struct net_device *dev);

/**
 * struct ieee80211_noa_data - holds temporary data for tracking P2P NoA state
 *
 * @next_tsf: TSF timestamp of the next absent state change
 * @has_next_tsf: next absent state change event pending
 *
 * @absent: descriptor bitmask, set if GO is currently absent
 *
 * private:
 *
 * @count: count fields from the NoA descriptors
 * @desc: adjusted data from the NoA
 */
struct ieee80211_noa_data {
	u32 next_tsf;
	bool has_next_tsf;

	u8 absent;

	/* The kernel-doc above lists the remaining fields under "private:". */
	u8 count[IEEE80211_P2P_NOA_DESC_MAX];
	struct {
		u32 start;
		u32 duration;
		u32 interval;
	} desc[IEEE80211_P2P_NOA_DESC_MAX];
};

/**
 * ieee80211_parse_p2p_noa - initialize NoA tracking data from P2P IE
 *
 * @attr: P2P NoA IE
 * @data: NoA tracking data
 * @tsf: current TSF timestamp
 *
 * Return: number of successfully parsed descriptors
 */
int ieee80211_parse_p2p_noa(const struct ieee80211_p2p_noa_attr *attr,
			    struct ieee80211_noa_data *data, u32 tsf);

/**
 * ieee80211_update_p2p_noa - get next pending P2P GO absent state change
 *
 * @data: NoA tracking data
 * @tsf: current TSF timestamp
 */
void ieee80211_update_p2p_noa(struct ieee80211_noa_data *data, u32 tsf);

/**
 * ieee80211_tdls_oper_request - request userspace to perform a TDLS operation
 * @vif: virtual interface
 * @peer: the peer's destination address
 * @oper: the requested TDLS operation
 * @reason_code: reason code for the operation, valid for TDLS teardown
 * @gfp: allocation flags
 *
 * See cfg80211_tdls_oper_request().
 */
void ieee80211_tdls_oper_request(struct ieee80211_vif *vif, const u8 *peer,
				 enum nl80211_tdls_operation oper,
				 u16 reason_code, gfp_t gfp);

/**
 * ieee80211_reserve_tid - request to reserve a specific TID
 *
 * There is sometimes a need (such as in TDLS) for blocking the driver from
 * using a specific TID so that the FW can use it for certain operations such
 * as sending PTI requests.
To make sure that the driver doesn't use that TID, * this function must be called as it flushes out packets on this TID and marks * it as blocked, so that any transmit for the station on this TID will be * redirected to the alternative TID in the same AC. * * Note that this function blocks and may call back into the driver, so it * should be called without driver locks held. Also note this function should * only be called from the driver's @sta_state callback. * * @sta: the station to reserve the TID for * @tid: the TID to reserve * * Returns: 0 on success, else on failure */ int ieee80211_reserve_tid(struct ieee80211_sta *sta, u8 tid); /** * ieee80211_unreserve_tid - request to unreserve a specific TID * * Once there is no longer any need for reserving a certain TID, this function * should be called, and no longer will packets have their TID modified for * preventing use of this TID in the driver. * * Note that this function blocks and acquires a lock, so it should be called * without driver locks held. Also note this function should only be called * from the driver's @sta_state callback. * * @sta: the station * @tid: the TID to unreserve */ void ieee80211_unreserve_tid(struct ieee80211_sta *sta, u8 tid); /** * ieee80211_tx_dequeue - dequeue a packet from a software tx queue * * @hw: pointer as obtained from ieee80211_alloc_hw() * @txq: pointer obtained from station or virtual interface, or from * ieee80211_next_txq() * * Returns the skb if successful, %NULL if no frame was available. * * Note that this must be called in an rcu_read_lock() critical section, * which can only be released after the SKB was handled. Some pointers in * skb->cb, e.g. the key pointer, are protected by RCU and thus the * critical section must persist not just for the duration of this call * but for the duration of the frame handling. * However, also note that while in the wake_tx_queue() method, * rcu_read_lock() is already held. 
*
 * softirqs must also be disabled when this function is called.
 * In process context, use ieee80211_tx_dequeue_ni() instead.
 */
struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
				     struct ieee80211_txq *txq);

/**
 * ieee80211_tx_dequeue_ni - dequeue a packet from a software tx queue
 * (in process context)
 *
 * Like ieee80211_tx_dequeue() but can be called in process context
 * (internally disables bottom halves).
 *
 * @hw: pointer as obtained from ieee80211_alloc_hw()
 * @txq: pointer obtained from station or virtual interface, or from
 *	ieee80211_next_txq()
 *
 * Return: whatever ieee80211_tx_dequeue() returned for @hw and @txq.
 */
static inline struct sk_buff *ieee80211_tx_dequeue_ni(struct ieee80211_hw *hw,
						      struct ieee80211_txq *txq)
{
	struct sk_buff *skb;

	/*
	 * ieee80211_tx_dequeue() requires softirqs to be disabled, so
	 * bracket the call with local_bh_disable()/local_bh_enable().
	 */
	local_bh_disable();
	skb = ieee80211_tx_dequeue(hw, txq);
	local_bh_enable();

	return skb;
}

/**
 * ieee80211_handle_wake_tx_queue - mac80211 handler for wake_tx_queue callback
 *
 * @hw: pointer as obtained from wake_tx_queue() callback().
 * @txq: pointer as obtained from wake_tx_queue() callback().
 *
 * Drivers can use this function for the mandatory mac80211 wake_tx_queue
 * callback in struct ieee80211_ops. They should not call this function.
 */
void ieee80211_handle_wake_tx_queue(struct ieee80211_hw *hw,
				    struct ieee80211_txq *txq);

/**
 * ieee80211_next_txq - get next tx queue to pull packets from
 *
 * @hw: pointer as obtained from ieee80211_alloc_hw()
 * @ac: AC number to return packets from.
 *
 * Returns the next txq if successful, %NULL if no queue is eligible. If a txq
 * is returned, it should be returned with ieee80211_return_txq() after the
 * driver has finished scheduling it.
 */
struct ieee80211_txq *ieee80211_next_txq(struct ieee80211_hw *hw, u8 ac);

/**
 * ieee80211_txq_schedule_start - start new scheduling round for TXQs
 *
 * @hw: pointer as obtained from ieee80211_alloc_hw()
 * @ac: AC number to acquire locks for
 *
 * Should be called before ieee80211_next_txq() or ieee80211_return_txq().
 * The driver must not call multiple TXQ scheduling rounds concurrently.
*/
void ieee80211_txq_schedule_start(struct ieee80211_hw *hw, u8 ac);

/* (deprecated) */
/* Empty body: calling this has no effect. */
static inline void ieee80211_txq_schedule_end(struct ieee80211_hw *hw, u8 ac)
{
}

void __ieee80211_schedule_txq(struct ieee80211_hw *hw,
			      struct ieee80211_txq *txq, bool force);

/**
 * ieee80211_schedule_txq - schedule a TXQ for transmission
 *
 * @hw: pointer as obtained from ieee80211_alloc_hw()
 * @txq: pointer obtained from station or virtual interface
 *
 * Schedules a TXQ for transmission if it is not already scheduled,
 * even if mac80211 does not have any packets buffered.
 *
 * The driver may call this function if it has buffered packets for
 * this TXQ internally.
 */
static inline void
ieee80211_schedule_txq(struct ieee80211_hw *hw, struct ieee80211_txq *txq)
{
	/* Always forces scheduling: force is hard-wired to true here. */
	__ieee80211_schedule_txq(hw, txq, true);
}

/**
 * ieee80211_return_txq - return a TXQ previously acquired by ieee80211_next_txq()
 *
 * @hw: pointer as obtained from ieee80211_alloc_hw()
 * @txq: pointer obtained from station or virtual interface
 * @force: schedule txq even if mac80211 does not have any buffered packets.
 *
 * The driver may set force=true if it has buffered packets for this TXQ
 * internally.
 */
static inline void
ieee80211_return_txq(struct ieee80211_hw *hw, struct ieee80211_txq *txq,
		     bool force)
{
	/* Same backend as ieee80211_schedule_txq(), with a caller-chosen force. */
	__ieee80211_schedule_txq(hw, txq, force);
}

/**
 * ieee80211_txq_may_transmit - check whether TXQ is allowed to transmit
 *
 * This function is used to check whether given txq is allowed to transmit by
 * the airtime scheduler, and can be used by drivers to access the airtime
 * fairness accounting without using the scheduling order enforced by
 * next_txq().
 *
 * Returns %true if the airtime scheduler thinks the TXQ should be allowed to
 * transmit, and %false if it should be throttled. This function can also have
 * the side effect of rotating the TXQ in the scheduler rotation, which will
 * eventually bring the deficit to positive and allow the station to transmit
 * again.
*
 * The API ieee80211_txq_may_transmit() also ensures that the TXQ list will be
 * aligned against the driver's own round-robin scheduler list, i.e. it rotates
 * the TXQ list until the requested node becomes the first entry
 * in the TXQ list. Thus both the TXQ list and driver's list are in sync. If this
 * function returns %true, the driver is expected to schedule packets
 * for transmission, and then return the TXQ through ieee80211_return_txq().
 *
 * @hw: pointer as obtained from ieee80211_alloc_hw()
 * @txq: pointer obtained from station or virtual interface
 */
bool ieee80211_txq_may_transmit(struct ieee80211_hw *hw,
				struct ieee80211_txq *txq);

/**
 * ieee80211_txq_get_depth - get pending frame/byte count of given txq
 *
 * The values are not guaranteed to be coherent with regard to each other, i.e.
 * txq state can change half-way of this function and the caller may end up
 * with "new" frame_cnt and "old" byte_cnt or vice-versa.
 *
 * @txq: pointer obtained from station or virtual interface
 * @frame_cnt: pointer to store frame count
 * @byte_cnt: pointer to store byte count
 */
void ieee80211_txq_get_depth(struct ieee80211_txq *txq,
			     unsigned long *frame_cnt,
			     unsigned long *byte_cnt);

/**
 * ieee80211_nan_func_terminated - notify about NAN function termination.
 *
 * This function is used to notify mac80211 about NAN function termination.
 * Note that this function can't be called from hard irq.
 *
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 * @inst_id: the local instance id
 * @reason: termination reason (one of the NL80211_NAN_FUNC_TERM_REASON_*)
 * @gfp: allocation flags
 */
void ieee80211_nan_func_terminated(struct ieee80211_vif *vif,
				   u8 inst_id,
				   enum nl80211_nan_func_term_reason reason,
				   gfp_t gfp);

/**
 * ieee80211_nan_func_match - notify about NAN function match event.
 *
 * This function is used to notify mac80211 about NAN function match. The
 * cookie inside the match struct will be assigned by mac80211.
* Note that this function can't be called from hard irq. * * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @match: match event information * @gfp: allocation flags */ void ieee80211_nan_func_match(struct ieee80211_vif *vif, struct cfg80211_nan_match_params *match, gfp_t gfp); /** * ieee80211_calc_rx_airtime - calculate estimated transmission airtime for RX. * * This function calculates the estimated airtime usage of a frame based on the * rate information in the RX status struct and the frame length. * * @hw: pointer as obtained from ieee80211_alloc_hw() * @status: &struct ieee80211_rx_status containing the transmission rate * information. * @len: frame length in bytes */ u32 ieee80211_calc_rx_airtime(struct ieee80211_hw *hw, struct ieee80211_rx_status *status, int len); /** * ieee80211_calc_tx_airtime - calculate estimated transmission airtime for TX. * * This function calculates the estimated airtime usage of a frame based on the * rate information in the TX info struct and the frame length. * * @hw: pointer as obtained from ieee80211_alloc_hw() * @info: &struct ieee80211_tx_info of the frame. * @len: frame length in bytes */ u32 ieee80211_calc_tx_airtime(struct ieee80211_hw *hw, struct ieee80211_tx_info *info, int len); /** * ieee80211_set_hw_80211_encap - enable hardware encapsulation offloading. * * This function is used to notify mac80211 that a vif can be passed raw 802.3 * frames. The driver needs to then handle the 802.11 encapsulation inside the * hardware or firmware. * * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @enable: indicate if the feature should be turned on or off */ bool ieee80211_set_hw_80211_encap(struct ieee80211_vif *vif, bool enable); /** * ieee80211_get_fils_discovery_tmpl - Get FILS discovery template. * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. * * The driver is responsible for freeing the returned skb. 
*
 * Return: FILS discovery template. %NULL on error.
 */
struct sk_buff *ieee80211_get_fils_discovery_tmpl(struct ieee80211_hw *hw,
						  struct ieee80211_vif *vif);

/**
 * ieee80211_get_unsol_bcast_probe_resp_tmpl - Get unsolicited broadcast
 *	probe response template.
 * @hw: pointer obtained from ieee80211_alloc_hw().
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 *
 * The driver is responsible for freeing the returned skb.
 *
 * Return: Unsolicited broadcast probe response template. %NULL on error.
 */
struct sk_buff *
ieee80211_get_unsol_bcast_probe_resp_tmpl(struct ieee80211_hw *hw,
					  struct ieee80211_vif *vif);

/**
 * ieee80211_obss_color_collision_notify - notify userland about a BSS color
 * collision.
 *
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 * @color_bitmap: a 64 bit bitmap representing the colors that the local BSS is
 *	aware of.
 */
void
ieee80211_obss_color_collision_notify(struct ieee80211_vif *vif,
				      u64 color_bitmap);

/**
 * ieee80211_is_tx_data - check if frame is a data frame
 *
 * The function is used to check if a frame is a data frame. Frames with
 * hardware encapsulation enabled are data frames.
 *
 * @skb: the frame to be transmitted.
 *
 * Return: true if %IEEE80211_TX_CTL_HW_80211_ENCAP is set in the frame's
 * tx info flags, or if ieee80211_is_data() accepts the frame_control field
 * read from the start of skb->data; false otherwise.
 */
static inline bool ieee80211_is_tx_data(struct sk_buff *skb)
{
	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
	/* skb->data is interpreted as an 802.11 header for the second test. */
	struct ieee80211_hdr *hdr = (void *) skb->data;

	return info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP ||
	       ieee80211_is_data(hdr->frame_control);
}

/**
 * ieee80211_set_active_links - set active links in client mode
 * @vif: interface to set active links on
 * @active_links: the new active links bitmap
 *
 * Context: Must be called with wiphy mutex held; may sleep; calls
 *	back into the driver.
 *
 * This changes the active links on an interface. The interface
 * must be in client mode (in AP mode, all links are always active),
 * and @active_links must be a subset of the vif's valid_links.
 *
 * If a link is switched off and another is switched on at the same
 * time (e.g.
active_links going from 0x1 to 0x10) then you will get * a sequence of calls like * * - change_vif_links(0x11) * - unassign_vif_chanctx(link_id=0) * - change_sta_links(0x11) for each affected STA (the AP) * (TDLS connections on now inactive links should be torn down) * - remove group keys on the old link (link_id 0) * - add new group keys (GTK/IGTK/BIGTK) on the new link (link_id 4) * - change_sta_links(0x10) for each affected STA (the AP) * - assign_vif_chanctx(link_id=4) * - change_vif_links(0x10) */ int ieee80211_set_active_links(struct ieee80211_vif *vif, u16 active_links); /** * ieee80211_set_active_links_async - asynchronously set active links * @vif: interface to set active links on * @active_links: the new active links bitmap * * See ieee80211_set_active_links() for more information, the only * difference here is that the link change is triggered async and * can be called in any context, but the link switch will only be * completed after it returns. */ void ieee80211_set_active_links_async(struct ieee80211_vif *vif, u16 active_links); /* for older drivers - let's not document these ... */ int ieee80211_emulate_add_chanctx(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *ctx); void ieee80211_emulate_remove_chanctx(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *ctx); void ieee80211_emulate_change_chanctx(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *ctx, u32 changed); int ieee80211_emulate_switch_vif_chanctx(struct ieee80211_hw *hw, struct ieee80211_vif_chanctx_switch *vifs, int n_vifs, enum ieee80211_chanctx_switch_mode mode); #endif /* MAC80211_H */
15 16 6 1 7 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 // SPDX-License-Identifier: GPL-2.0-only /* iptables module for using new netfilter netlink queue * * (C) 2005 by Harald Welte <laforge@netfilter.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/netfilter.h> #include <linux/netfilter_arp.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_NFQUEUE.h> #include <net/netfilter/nf_queue.h> MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_DESCRIPTION("Xtables: packet forwarding to netlink"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_NFQUEUE"); MODULE_ALIAS("ip6t_NFQUEUE"); MODULE_ALIAS("arpt_NFQUEUE"); static u32 jhash_initval __read_mostly; static unsigned int nfqueue_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_NFQ_info *tinfo = par->targinfo; return NF_QUEUE_NR(tinfo->queuenum); } static unsigned int nfqueue_tg_v1(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_NFQ_info_v1 *info = par->targinfo; u32 queue = info->queuenum; if (info->queues_total > 1) { queue = nfqueue_hash(skb, queue, info->queues_total, xt_family(par), jhash_initval); } return NF_QUEUE_NR(queue); } static unsigned int nfqueue_tg_v2(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_NFQ_info_v2 *info = par->targinfo; unsigned int ret = nfqueue_tg_v1(skb, par); if (info->bypass) ret |= NF_VERDICT_FLAG_QUEUE_BYPASS; return ret; } static int 
nfqueue_tg_check(const struct xt_tgchk_param *par) { const struct xt_NFQ_info_v3 *info = par->targinfo; u32 maxid; init_hashrandom(&jhash_initval); if (info->queues_total == 0) { pr_info_ratelimited("number of total queues is 0\n"); return -EINVAL; } maxid = info->queues_total - 1 + info->queuenum; if (maxid > 0xffff) { pr_info_ratelimited("number of queues (%u) out of range (got %u)\n", info->queues_total, maxid); return -ERANGE; } if (par->target->revision == 2 && info->flags > 1) return -EINVAL; if (par->target->revision == 3 && info->flags & ~NFQ_FLAG_MASK) return -EINVAL; return 0; } static unsigned int nfqueue_tg_v3(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_NFQ_info_v3 *info = par->targinfo; u32 queue = info->queuenum; int ret; if (info->queues_total > 1) { if (info->flags & NFQ_FLAG_CPU_FANOUT) { int cpu = smp_processor_id(); queue = info->queuenum + cpu % info->queues_total; } else { queue = nfqueue_hash(skb, queue, info->queues_total, xt_family(par), jhash_initval); } } ret = NF_QUEUE_NR(queue); if (info->flags & NFQ_FLAG_BYPASS) ret |= NF_VERDICT_FLAG_QUEUE_BYPASS; return ret; } static struct xt_target nfqueue_tg_reg[] __read_mostly = { { .name = "NFQUEUE", .family = NFPROTO_UNSPEC, .target = nfqueue_tg, .targetsize = sizeof(struct xt_NFQ_info), .me = THIS_MODULE, }, { .name = "NFQUEUE", .revision = 1, .family = NFPROTO_UNSPEC, .checkentry = nfqueue_tg_check, .target = nfqueue_tg_v1, .targetsize = sizeof(struct xt_NFQ_info_v1), .me = THIS_MODULE, }, { .name = "NFQUEUE", .revision = 2, .family = NFPROTO_UNSPEC, .checkentry = nfqueue_tg_check, .target = nfqueue_tg_v2, .targetsize = sizeof(struct xt_NFQ_info_v2), .me = THIS_MODULE, }, { .name = "NFQUEUE", .revision = 3, .family = NFPROTO_UNSPEC, .checkentry = nfqueue_tg_check, .target = nfqueue_tg_v3, .targetsize = sizeof(struct xt_NFQ_info_v3), .me = THIS_MODULE, }, }; static int __init nfqueue_tg_init(void) { return xt_register_targets(nfqueue_tg_reg, 
ARRAY_SIZE(nfqueue_tg_reg)); } static void __exit nfqueue_tg_exit(void) { xt_unregister_targets(nfqueue_tg_reg, ARRAY_SIZE(nfqueue_tg_reg)); } module_init(nfqueue_tg_init); module_exit(nfqueue_tg_exit);
4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 
527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 
1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 
1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444
// SPDX-License-Identifier: GPL-2.0-or-later
 /*
 *	x86 SMP booting functions
 *
 *	(c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
 *	(c) 1998, 1999, 2000, 2009 Ingo Molnar <mingo@redhat.com>
 *	Copyright 2001 Andi Kleen, SuSE Labs.
 *
 *	Much of the core SMP work is based on previous work by Thomas Radke, to
 *	whom a great many thanks are extended.
 *
 *	Thanks to Intel for making available several different Pentium,
 *	Pentium Pro and Pentium-II/Xeon MP machines.
 *	Original development of Linux SMP code supported by Caldera.
 *
 *	Fixes
 *		Felix Koop	:	NR_CPUS used properly
 *		Jose Renau	:	Handle single CPU case.
 *		Alan Cox	:	By repeated request 8) - Total BogoMIPS report.
 *		Greg Wright	:	Fix for kernel stacks panic.
 *		Erich Boleyn	:	MP v1.4 and additional changes.
 *	Matthias Sattler	:	Changes for 2.1 kernel map.
 *	Michel Lespinasse	:	Changes for 2.1 kernel map.
 *	Michael Chastain	:	Change trampoline.S to gnu as.
 *		Alan Cox	:	Dumb bug: 'B' step PPro's are fine
 *		Ingo Molnar	:	Added APIC timers, based on code
 *					from Jose Renau
 *		Ingo Molnar	:	various cleanups and rewrites
 *		Tigran Aivazian	:	fixed "0.00 in /proc/uptime on SMP" bug.
 *	Maciej W. Rozycki	:	Bits for genuine 82489DX APICs
 *	Andi Kleen		:	Changed for SMP boot into long mode.
 *		Martin J. Bligh	: 	Added support for multi-quad systems
 *		Dave Jones	:	Report invalid combinations of Athlon CPUs.
 *		Rusty Russell	:	Hacked into shape for new "hotplug" boot process.
 *      Andi Kleen              :       Converted to new state machine.
 *	Ashok Raj		: 	CPU hotplug support
 *	Glauber Costa		:	i386 and x86_64 integration
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/init.h>
#include <linux/smp.h>
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/sched/topology.h>
#include <linux/sched/hotplug.h>
#include <linux/sched/task_stack.h>
#include <linux/percpu.h>
#include <linux/memblock.h>
#include <linux/err.h>
#include <linux/nmi.h>
#include <linux/tboot.h>
#include <linux/gfp.h>
#include <linux/cpuidle.h>
#include <linux/kexec.h>
#include <linux/numa.h>
#include <linux/pgtable.h>
#include <linux/overflow.h>
#include <linux/stackprotector.h>
#include <linux/cpuhotplug.h>
#include <linux/mc146818rtc.h>

#include <asm/acpi.h>
#include <asm/cacheinfo.h>
#include <asm/desc.h>
#include <asm/nmi.h>
#include <asm/irq.h>
#include <asm/realmode.h>
#include <asm/cpu.h>
#include <asm/numa.h>
#include <asm/tlbflush.h>
#include <asm/mtrr.h>
#include <asm/mwait.h>
#include <asm/apic.h>
#include <asm/io_apic.h>
#include <asm/fpu/api.h>
#include <asm/setup.h>
#include <asm/uv/uv.h>
#include <asm/microcode.h>
#include <asm/i8259.h>
#include <asm/misc.h>
#include <asm/qspinlock.h>
#include <asm/intel-family.h>
#include <asm/cpu_device_id.h>
#include <asm/spec-ctrl.h>
#include <asm/hw_irq.h>
#include <asm/stackprotector.h>
#include <asm/sev.h>
/* NOTE(review): <asm/spec-ctrl.h> is already included a few lines above */
#include <asm/spec-ctrl.h>

/* representing HT siblings of each logical CPU */
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);

/* representing HT and core siblings of each logical CPU */
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
EXPORT_PER_CPU_SYMBOL(cpu_core_map);

/* representing HT, core, and die siblings of each logical CPU */
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map);
EXPORT_PER_CPU_SYMBOL(cpu_die_map);

/* CPUs which are the primary SMT threads */
struct cpumask __cpu_primary_thread_mask __read_mostly;

/* Representing CPUs for which sibling maps can be computed */
static cpumask_var_t cpu_sibling_setup_mask;

/* Per-CPU control/status word pair; see mwait_cpu_dead below */
struct mwait_cpu_dead {
	unsigned int	control;
	unsigned int	status;
};

#define CPUDEAD_MWAIT_WAIT	0xDEADBEEF
#define CPUDEAD_MWAIT_KEXEC_HLT	0x4A17DEAD

/*
 * Cache line aligned data for mwait_play_dead(). Separate on purpose so
 * that it's unlikely to be touched by other CPUs.
 */
static DEFINE_PER_CPU_ALIGNED(struct mwait_cpu_dead, mwait_cpu_dead);

/* Maximum number of SMT threads on any online core */
int __read_mostly __max_smt_threads = 1;

/* Flag to indicate if a complete sched domain rebuild is required */
bool x86_topology_update;

/*
 * Report and consume the pending-topology-update flag.
 *
 * Returns the value x86_topology_update held on entry and resets the flag
 * to false.
 */
int arch_update_cpu_topology(void)
{
	int retval = x86_topology_update;

	x86_topology_update = false;
	return retval;
}

/* Number of outstanding smpboot_setup_warm_reset_vector() users */
static unsigned int smpboot_warm_reset_vector_count;

/*
 * Install the warm reset code and vector for @start_eip.
 *
 * Only the first user (count going from 0 to 1) touches the hardware: it
 * writes 0xa to CMOS register 0xf and stores start_eip >> 4 and
 * start_eip & 0xf at TRAMPOLINE_PHYS_HIGH and TRAMPOLINE_PHYS_LOW. The
 * counter and the writes are serialised by rtc_lock.
 */
static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
{
	unsigned long flags;

	spin_lock_irqsave(&rtc_lock, flags);
	if (!smpboot_warm_reset_vector_count++) {
		CMOS_WRITE(0xa, 0xf);
		*((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) = start_eip >> 4;
		*((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = start_eip & 0xf;
	}
	spin_unlock_irqrestore(&rtc_lock, flags);
}

/*
 * Drop one user of the warm reset vector; the last user (count reaching 0)
 * clears CMOS register 0xf and zeroes the 32-bit word at
 * TRAMPOLINE_PHYS_LOW.
 */
static inline void smpboot_restore_warm_reset_vector(void)
{
	unsigned long flags;

	/*
	 * Paranoid:  Set warm reset code and vector here back
	 * to default values.
	 */
	spin_lock_irqsave(&rtc_lock, flags);
	if (!--smpboot_warm_reset_vector_count) {
		CMOS_WRITE(0, 0xf);
		*((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
	}
	spin_unlock_irqrestore(&rtc_lock, flags);

}

/* Run the next set of setup steps for the upcoming CPU */
static void ap_starting(void)
{
	int cpuid = smp_processor_id();

	/* Mop up eventual mwait_play_dead() wreckage */
	this_cpu_write(mwait_cpu_dead.status, 0);
	this_cpu_write(mwait_cpu_dead.control, 0);

	/*
	 * If woken up by an INIT in an 82489DX configuration the alive
	 * synchronization guarantees that the CPU does not reach this
	 * point before an INIT_deassert IPI reaches the local APIC, so it
	 * is now safe to touch the local APIC.
	 *
	 * Set up this CPU, first the APIC, which is probably redundant on
	 * most boards.
	 */
	apic_ap_setup();

	/* Save the processor parameters. */
	smp_store_cpu_info(cpuid);

	/*
	 * The topology information must be up to date before
	 * notify_cpu_starting().
	 */
	set_cpu_sibling_map(cpuid);

	ap_init_aperfmperf();

	pr_debug("Stack at about %p\n", &cpuid);

	wmb();

	/*
	 * This runs the AP through all the cpuhp states to its target
	 * state CPUHP_ONLINE.
	 */
	notify_cpu_starting(cpuid);
}

/*
 * Run calibrate_delay() on the current CPU and store the resulting
 * loops_per_jiffy in that CPU's cpu_data entry.
 */
static void ap_calibrate_delay(void)
{
	/*
	 * Calibrate the delay loop and update loops_per_jiffy in cpu_data.
	 * smp_store_cpu_info() stored a value that is close but not as
	 * accurate as the value just calculated.
	 *
	 * As this is invoked after the TSC synchronization check,
	 * calibrate_delay_is_known() will skip the calibration routine
	 * when TSC is synchronized across sockets.
	 */
	calibrate_delay();
	cpu_data(smp_processor_id()).loops_per_jiffy = loops_per_jiffy;
}

/*
 * Activate a secondary processor.
 */
static void notrace start_secondary(void *unused)
{
	/*
	 * Don't put *anything* except direct CPU state initialization
	 * before cpu_init(), SMP booting is too fragile that we want to
	 * limit the things done here to the most necessary things.
	 */
	cr4_init();

	/*
	 * 32-bit specific. 64-bit reaches this code with the correct page
	 * table established. Yet another historical divergence.
	 */
	if (IS_ENABLED(CONFIG_X86_32)) {
		/* switch away from the initial page table */
		load_cr3(swapper_pg_dir);
		__flush_tlb_all();
	}

	cpu_init_exception_handling();

	/*
	 * Load the microcode before reaching the AP alive synchronization
	 * point below so it is not part of the full per CPU serialized
	 * bringup part when "parallel" bringup is enabled.
	 *
	 * That's even safe when hyperthreading is enabled in the CPU as
	 * the core code starts the primary threads first and leaves the
	 * secondary threads waiting for SIPI. Loading microcode on
	 * physical cores concurrently is a safe operation.
	 *
	 * This covers both the Intel specific issue that concurrent
	 * microcode loading on SMT siblings must be prohibited and the
	 * vendor independent issue that microcode loading which changes
	 * CPUID, MSRs etc. must be strictly serialized to maintain
	 * software state correctness.
	 */
	load_ucode_ap();

	/*
	 * Synchronization point with the hotplug core. Sets this CPUs
	 * synchronization state to ALIVE and spin-waits for the control CPU to
	 * release this CPU for further bringup.
	 */
	cpuhp_ap_sync_alive();

	cpu_init();
	fpu__init_cpu();
	rcutree_report_cpu_starting(raw_smp_processor_id());
	x86_cpuinit.early_percpu_clock_init();

	ap_starting();

	/* Check TSC synchronization with the control CPU. */
	check_tsc_sync_target();

	/*
	 * Calibrate the delay loop after the TSC synchronization check.
	 * This allows to skip the calibration when TSC is synchronized
	 * across sockets.
	 */
	ap_calibrate_delay();

	speculative_store_bypass_ht_init();

	/*
	 * Lock vector_lock, set CPU online and bring the vector
	 * allocator online. Online must be set with vector_lock held
	 * to prevent a concurrent irq setup/teardown from seeing a
	 * half valid vector space.
	 */
	lock_vector_lock();
	set_cpu_online(smp_processor_id(), true);
	lapic_online();
	unlock_vector_lock();
	x86_platform.nmi_init();

	/* enable local interrupts */
	local_irq_enable();

	x86_cpuinit.setup_percpu_clockev();

	wmb();
	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
}

/* Seed cpu_data(0) from boot_cpu_data and mark it initialized */
static void __init smp_store_boot_cpu_info(void)
{
	struct cpuinfo_x86 *c = &cpu_data(0);

	*c = boot_cpu_data;
	c->initialized = true;
}

/*
 * The bootstrap kernel entry code has set these up. Save them for
 * a given CPU
 */
void smp_store_cpu_info(int id)
{
	struct cpuinfo_x86 *c = &cpu_data(id);

	/* Copy boot_cpu_data only on the first bringup */
	if (!c->initialized)
		*c = boot_cpu_data;
	c->cpu_index = id;
	/*
	 * During boot time, CPU0 has this setup already. Save the info when
	 * bringing up an AP.
	 */
	identify_secondary_cpu(c);
	c->initialized = true;
}

/* True when both CPUs map to the same node according to cpu_to_node() */
static bool
topology_same_node(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
{
	int cpu1 = c->cpu_index, cpu2 = o->cpu_index;

	return (cpu_to_node(cpu1) == cpu_to_node(cpu2));
}

/*
 * Sanity check for a proposed @name-sibling relation between @c and @o.
 *
 * Returns true when both CPUs are on the same node. Otherwise emits a
 * one-time warning and returns false so the caller ignores the relation.
 */
static bool
topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name)
{
	int cpu1 = c->cpu_index, cpu2 = o->cpu_index;

	return !WARN_ONCE(!topology_same_node(c, o),
		"sched: CPU #%d's %s-sibling CPU #%d is not on the same node! "
		"[node: %d != %d]. Ignoring dependency.\n",
		cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2));
}

/* Mark c1 and c2 in each other's mask, where mfunc(cpu) yields the mask */
#define link_mask(mfunc, c1, c2)					\
do {									\
	cpumask_set_cpu((c1), mfunc(c2));				\
	cpumask_set_cpu((c2), mfunc(c1));				\
} while (0)

/*
 * Decide whether @c and @o are SMT siblings.
 *
 * With X86_FEATURE_TOPOEXT the package, die, AMD node and LLC ids must all
 * match, and then either the core ids match or both have a valid (not 0xff)
 * and equal compute unit id. Without it, matching package, die and core ids
 * suffice. A match is additionally subject to topology_sane().
 */
static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
{
	if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
		int cpu1 = c->cpu_index, cpu2 = o->cpu_index;

		if (c->topo.pkg_id == o->topo.pkg_id &&
		    c->topo.die_id == o->topo.die_id &&
		    c->topo.amd_node_id == o->topo.amd_node_id &&
		    per_cpu_llc_id(cpu1) == per_cpu_llc_id(cpu2)) {
			if (c->topo.core_id == o->topo.core_id)
				return topology_sane(c, o, "smt");

			if ((c->topo.cu_id != 0xff) &&
			    (o->topo.cu_id != 0xff) &&
			    (c->topo.cu_id == o->topo.cu_id))
				return topology_sane(c, o, "smt");
		}

	} else if (c->topo.pkg_id == o->topo.pkg_id &&
		   c->topo.die_id == o->topo.die_id &&
		   c->topo.core_id == o->topo.core_id) {
		return topology_sane(c, o, "smt");
	}

	return false;
}

/*
 * Decide whether @c and @o are on the same die: package and die ids must
 * match, and with TOPOEXT and more than one AMD node per package the AMD
 * node ids must match as well.
 */
static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
{
	if (c->topo.pkg_id != o->topo.pkg_id || c->topo.die_id != o->topo.die_id)
		return false;

	if (cpu_feature_enabled(X86_FEATURE_TOPOEXT) && topology_amd_nodes_per_pkg() > 1)
		return c->topo.amd_node_id == o->topo.amd_node_id;

	return true;
}

/* Decide whether @c and @o share an L2 cache (see fallback below) */
static bool match_l2c(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
{
	int cpu1 = c->cpu_index, cpu2 = o->cpu_index;

	/* If the arch didn't set up l2c_id, fall back to SMT */
	if (per_cpu_l2c_id(cpu1) == BAD_APICID)
		return match_smt(c, o);

	/* Do not match if L2 cache id does not match: */
	if (per_cpu_l2c_id(cpu1) != per_cpu_l2c_id(cpu2))
		return false;

	return topology_sane(c, o, "l2c");
}

/*
 * Unlike the other levels, we do not enforce keeping a
 * multicore group inside a NUMA node.  If this happens, we will
 * discard the MC level of the topology later.
 */
static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
{
	if (c->topo.pkg_id == o->topo.pkg_id)
		return true;
	return false;
}

/*
 * Define intel_cod_cpu[] for Intel COD (Cluster-on-Die) CPUs.
 *
 * Any Intel CPU that has multiple nodes per package and does not
 * match intel_cod_cpu[] has the SNC (Sub-NUMA Cluster) topology.
 *
 * When in SNC mode, these CPUs enumerate an LLC that is shared
 * by multiple NUMA nodes. The LLC is shared for off-package data
 * access but private to the NUMA node (half of the package) for
 * on-package access. CPUID (the source of the information about
 * the LLC) can only enumerate the cache as shared or unshared,
 * but not this particular configuration.
 */

static const struct x86_cpu_id intel_cod_cpu[] = {
	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, 0),	/* COD */
	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, 0),	/* COD */
	X86_MATCH_INTEL_FAM6_MODEL(ANY, 1),		/* SNC */
	{}
};

/*
 * Decide whether @c and @o share a last level cache. A matching entry in
 * intel_cod_cpu[] with non-zero driver_data marks the SNC case handled
 * below.
 */
static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
{
	const struct x86_cpu_id *id = x86_match_cpu(intel_cod_cpu);
	int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
	bool intel_snc = id && id->driver_data;

	/* Do not match if we do not have a valid APICID for cpu: */
	if (per_cpu_llc_id(cpu1) == BAD_APICID)
		return false;

	/* Do not match if LLC id does not match: */
	if (per_cpu_llc_id(cpu1) != per_cpu_llc_id(cpu2))
		return false;

	/*
	 * Allow the SNC topology without warning. Return of false
	 * means 'c' does not share the LLC of 'o'. This will be
	 * reflected to userspace.
	 */
	if (match_pkg(c, o) && !topology_same_node(c, o) && intel_snc)
		return false;

	return topology_sane(c, o, "llc");
}

/* SD_ASYM_PACKING while sysctl_sched_itmt_enabled is set, otherwise 0 */
static inline int x86_sched_itmt_flags(void)
{
	return sysctl_sched_itmt_enabled ? SD_ASYM_PACKING : 0;
}

#ifdef CONFIG_SCHED_MC
/* Flags callback for the MC level: generic core flags plus ITMT flags */
static int x86_core_flags(void)
{
	return cpu_core_flags() | x86_sched_itmt_flags();
}
#endif
#ifdef CONFIG_SCHED_SMT
/* Flags callback for the SMT level: generic SMT flags only */
static int x86_smt_flags(void)
{
	return cpu_smt_flags();
}
#endif
#ifdef CONFIG_SCHED_CLUSTER
/* Flags callback for the CLS level: generic cluster flags plus ITMT flags */
static int x86_cluster_flags(void)
{
	return cpu_cluster_flags() | x86_sched_itmt_flags();
}
#endif

/* Flags callback for the PKG level: ITMT flags on hybrid CPUs, else 0 */
static int x86_die_flags(void)
{
	if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
	       return x86_sched_itmt_flags();

	return 0;
}

/*
 * Set if a package/die has multiple NUMA nodes inside.
 * AMD Magny-Cours, Intel Cluster-on-Die, and Intel
 * Sub-NUMA Clustering have this.
 */
static bool x86_has_numa_in_package;

/* Filled by build_sched_topology(); sized for all levels plus a NULL entry */
static struct sched_domain_topology_level x86_topology[6];

/*
 * Populate x86_topology[] with the levels enabled in the kernel
 * configuration (SMT, CLS, MC), add PKG unless a package contains several
 * NUMA nodes, and hand the table to set_sched_topology().
 */
static void __init build_sched_topology(void)
{
	int i = 0;

#ifdef CONFIG_SCHED_SMT
	x86_topology[i++] = (struct sched_domain_topology_level){
		cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT)
	};
#endif
#ifdef CONFIG_SCHED_CLUSTER
	x86_topology[i++] = (struct sched_domain_topology_level){
		cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS)
	};
#endif
#ifdef CONFIG_SCHED_MC
	x86_topology[i++] = (struct sched_domain_topology_level){
		cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC)
	};
#endif
	/*
	 * When there is NUMA topology inside the package skip the PKG domain
	 * since the NUMA domains will auto-magically create the right spanning
	 * domains based on the SLIT.
	 */
	if (!x86_has_numa_in_package) {
		x86_topology[i++] = (struct sched_domain_topology_level){
			cpu_cpu_mask, x86_die_flags, SD_INIT_NAME(PKG)
		};
	}

	/*
	 * There must be one trailing NULL entry left.
	 */
	BUG_ON(i >= ARRAY_SIZE(x86_topology)-1);

	set_sched_topology(x86_topology);
}

/*
 * Link @cpu into the sibling, LLC, L2, die and core masks of every CPU
 * already present in cpu_sibling_setup_mask, and maintain
 * __max_smt_threads, smt_active and booted_cores along the way.
 */
void set_cpu_sibling_map(int cpu)
{
	bool has_smt = __max_threads_per_core > 1;
	bool has_mp = has_smt || topology_num_cores_per_package() > 1;
	struct cpuinfo_x86 *c = &cpu_data(cpu);
	struct cpuinfo_x86 *o;
	int i, threads;

	cpumask_set_cpu(cpu, cpu_sibling_setup_mask);

	/* Single thread, single core: the CPU is its own only sibling */
	if (!has_mp) {
		cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu));
		cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
		cpumask_set_cpu(cpu, cpu_l2c_shared_mask(cpu));
		cpumask_set_cpu(cpu, topology_core_cpumask(cpu));
		cpumask_set_cpu(cpu, topology_die_cpumask(cpu));
		c->booted_cores = 1;
		return;
	}

	for_each_cpu(i, cpu_sibling_setup_mask) {
		o = &cpu_data(i);

		if (match_pkg(c, o) && !topology_same_node(c, o))
			x86_has_numa_in_package = true;

		if ((i == cpu) || (has_smt && match_smt(c, o)))
			link_mask(topology_sibling_cpumask, cpu, i);

		if ((i == cpu) || (has_mp && match_llc(c, o)))
			link_mask(cpu_llc_shared_mask, cpu, i);

		if ((i == cpu) || (has_mp && match_l2c(c, o)))
			link_mask(cpu_l2c_shared_mask, cpu, i);

		if ((i == cpu) || (has_mp && match_die(c, o)))
			link_mask(topology_die_cpumask, cpu, i);
	}

	threads = cpumask_weight(topology_sibling_cpumask(cpu));
	if (threads > __max_smt_threads)
		__max_smt_threads = threads;

	for_each_cpu(i, topology_sibling_cpumask(cpu))
		cpu_data(i).smt_active = threads > 1;

	/*
	 * This needs a separate iteration over the cpus because we rely on all
	 * topology_sibling_cpumask links to be set-up.
	 */
	for_each_cpu(i, cpu_sibling_setup_mask) {
		o = &cpu_data(i);

		if ((i == cpu) || (has_mp && match_pkg(c, o))) {
			link_mask(topology_core_cpumask, cpu, i);

			/*
			 *  Does this new cpu bringup a new core?
			 */
			if (threads == 1) {
				/*
				 * for each core in package, increment
				 * the booted_cores for this new cpu
				 */
				if (cpumask_first(
				    topology_sibling_cpumask(i)) == i)
					c->booted_cores++;
				/*
				 * increment the core count for all
				 * the other cpus in this package
				 */
				if (i != cpu)
					cpu_data(i).booted_cores++;
			} else if (i != cpu && !c->booted_cores)
				c->booted_cores = cpu_data(i).booted_cores;
		}
	}
}

/* maps the cpu to the sched domain representing multi-core */
const struct cpumask *cpu_coregroup_mask(int cpu)
{
	return cpu_llc_shared_mask(cpu);
}

/* Mask used for the cluster (CLS) level: the CPUs sharing @cpu's L2 mask */
const struct cpumask *cpu_clustergroup_mask(int cpu)
{
	return cpu_l2c_shared_mask(cpu);
}
EXPORT_SYMBOL_GPL(cpu_clustergroup_mask);

/*
 * Sum loops_per_jiffy over all online CPUs and print the total as
 * BogoMIPS together with the number of online CPUs.
 */
static void impress_friends(void)
{
	int cpu;
	unsigned long bogosum = 0;
	/*
	 * Allow the user to impress friends.
	 */
	pr_debug("Before bogomips\n");
	for_each_online_cpu(cpu)
		bogosum += cpu_data(cpu).loops_per_jiffy;

	pr_info("Total of %d processors activated (%lu.%02lu BogoMIPS)\n",
		num_online_cpus(),
		bogosum/(500000/HZ),
		(bogosum/(5000/HZ))%100);

	pr_debug("Before bogocount - setting activated=1\n");
}

/*
 * The Multiprocessor Specification 1.4 (1997) example code suggests
 * that there should be a 10ms delay between the BSP asserting INIT
 * and de-asserting INIT, when starting a remote processor.
 * But that slows boot and resume on modern processors, which include
 * many cores and don't require that delay.
 *
 * Cmdline "cpu_init_udelay=" is available to over-ride this delay.
 * Modern processor families are quirked to remove the delay entirely.
 */
#define UDELAY_10MS_DEFAULT 10000

/* UINT_MAX means "not set on the command line"; see smp_quirk_init_udelay() */
static unsigned int init_udelay = UINT_MAX;

/* Parse the "cpu_init_udelay" early parameter into init_udelay */
static int __init cpu_init_udelay(char *str)
{
	get_option(&str, &init_udelay);

	return 0;
}
early_param("cpu_init_udelay", cpu_init_udelay);

/*
 * Pick the INIT assert/deassert delay unless the command line already did:
 * 0 for the vendor/family combinations tested below, the legacy 10ms
 * otherwise.
 */
static void __init smp_quirk_init_udelay(void)
{
	/* if cmdline changed it from default, leave it alone */
	if (init_udelay != UINT_MAX)
		return;

	/* if modern processor, use no delay */
	if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) ||
	    ((boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) && (boot_cpu_data.x86 >= 0x18)) ||
	    ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) {
		init_udelay = 0;
		return;
	}
	/* else, use legacy delay */
	init_udelay = UDELAY_10MS_DEFAULT;
}

/*
 * Send the INIT part of the wakeup sequence to @phys_apicid: assert INIT,
 * wait init_udelay microseconds, then deassert INIT. The STARTUP IPIs are
 * sent by the caller.
 */
static void send_init_sequence(u32 phys_apicid)
{
	int maxlvt = lapic_get_maxlvt();

	/* Be paranoid about clearing APIC errors. */
	if (APIC_INTEGRATED(boot_cpu_apic_version)) {
		/* Due to the Pentium erratum 3AP.  */
		if (maxlvt > 3)
			apic_write(APIC_ESR, 0);
		apic_read(APIC_ESR);
	}

	/* Assert INIT on the target CPU */
	apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT, phys_apicid);
	safe_apic_wait_icr_idle();

	udelay(init_udelay);

	/* Deassert INIT on the target CPU */
	apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid);
	safe_apic_wait_icr_idle();
}

/*
 * Wake up AP by INIT, INIT, STARTUP sequence.
 *
 * Returns 0 when every STARTUP IPI was sent and accepted, otherwise the
 * OR of the ICR send status and the masked APIC_ESR accept status.
 */
static int wakeup_secondary_cpu_via_init(u32 phys_apicid, unsigned long start_eip)
{
	unsigned long send_status = 0, accept_status = 0;
	int num_starts, j, maxlvt;

	preempt_disable();
	maxlvt = lapic_get_maxlvt();
	send_init_sequence(phys_apicid);

	mb();

	/*
	 * Should we send STARTUP IPIs ?
	 *
	 * Determine this based on the APIC version.
	 * If we don't have an integrated APIC, don't send the STARTUP IPIs.
	 */
	if (APIC_INTEGRATED(boot_cpu_apic_version))
		num_starts = 2;
	else
		num_starts = 0;

	/*
	 * Run STARTUP IPI loop.
	 */
	pr_debug("#startup loops: %d\n", num_starts);

	for (j = 1; j <= num_starts; j++) {
		pr_debug("Sending STARTUP #%d\n", j);
		if (maxlvt > 3)		/* Due to the Pentium erratum 3AP.  */
			apic_write(APIC_ESR, 0);
		apic_read(APIC_ESR);
		pr_debug("After apic_write\n");

		/*
		 * STARTUP IPI
		 */

		/* Target chip */
		/* Boot on the stack */
		/* Kick the second */
		apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12),
			       phys_apicid);

		/*
		 * Give the other CPU some time to accept the IPI.
		 */
		if (init_udelay == 0)
			udelay(10);
		else
			udelay(300);

		pr_debug("Startup point 1\n");

		pr_debug("Waiting for send to finish...\n");
		send_status = safe_apic_wait_icr_idle();

		/*
		 * Give the other CPU some time to accept the IPI.
		 */
		if (init_udelay == 0)
			udelay(10);
		else
			udelay(200);

		if (maxlvt > 3)		/* Due to the Pentium erratum 3AP.  */
			apic_write(APIC_ESR, 0);
		accept_status = (apic_read(APIC_ESR) & 0xEF);
		/* Stop at the first failure; the status is reported below */
		if (send_status || accept_status)
			break;
	}
	pr_debug("After Startup\n");

	if (send_status)
		pr_err("APIC never delivered???\n");
	if (accept_status)
		pr_err("APIC delivery error (%lx)\n", accept_status);

	preempt_enable();
	return (send_status | accept_status);
}

/* reduce the number of lines printed when booting a large cpu count system */
static void announce_cpu(int cpu, int apicid)
{
	/* Function-static state: column widths and the node printed last */
	static int width, node_width, first = 1;
	static int current_node = NUMA_NO_NODE;
	int node = early_cpu_to_node(cpu);

	if (!width)
		width = num_digits(num_possible_cpus()) + 1; /* + '#' sign */

	if (!node_width)
		node_width = num_digits(num_possible_nodes()) + 1; /* + '#' */

	if (system_state < SYSTEM_RUNNING) {
		if (first)
			pr_info("x86: Booting SMP configuration:\n");

		if (node != current_node) {
			if (current_node > (-1))
				pr_cont("\n");
			current_node = node;

			printk(KERN_INFO ".... node %*s#%d, CPUs: ",
			       node_width - num_digits(node), " ", node);
		}

		/* Add padding for the BSP */
		if (first)
			pr_cont("%*s", width + 1, " ");
		first = 0;

		pr_cont("%*s#%d", width - num_digits(cpu), " ", cpu);
	} else
		pr_info("Booting Node %d Processor %d APIC 0x%x\n",
			node, cpu, apicid);
}

/*
 * Per-CPU preparation shared by the bringup paths: publish @idle as the
 * CPU's current task, set up its stack canary and interrupt stack(s).
 *
 * Returns 0 on success or the error from irq_init_percpu_irqstack().
 */
int common_cpu_up(unsigned int cpu, struct task_struct *idle)
{
	int ret;

	/* Just in case we booted with a single CPU. */
	alternatives_enable_smp();

	per_cpu(pcpu_hot.current_task, cpu) = idle;
	cpu_init_stack_canary(cpu, idle);

	/* Initialize the interrupt stack(s) */
	ret = irq_init_percpu_irqstack(cpu);
	if (ret)
		return ret;

#ifdef CONFIG_X86_32
	/* Stack for startup_32 can be just as for start_secondary onwards */
	per_cpu(pcpu_hot.top_of_stack, cpu) = task_top_of_stack(idle);
#endif
	return 0;
}

/*
 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
 * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
 * Returns zero if startup was successfully sent, else error code from
 * ->wakeup_secondary_cpu.
 */
static int do_boot_cpu(u32 apicid, int cpu, struct task_struct *idle)
{
	unsigned long start_ip = real_mode_header->trampoline_start;
	int ret;

#ifdef CONFIG_X86_64
	/* If 64-bit wakeup method exists, use the 64-bit mode trampoline IP */
	if (apic->wakeup_secondary_cpu_64)
		start_ip = real_mode_header->trampoline_start64;
#endif
	idle->thread.sp = (unsigned long)task_pt_regs(idle);
	initial_code = (unsigned long)start_secondary;

	if (IS_ENABLED(CONFIG_X86_32)) {
		early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
		initial_stack  = idle->thread.sp;
	} else if (!(smpboot_control & STARTUP_PARALLEL_MASK)) {
		smpboot_control = cpu;
	}

	/* Enable the espfix hack for this CPU */
	init_espfix_ap(cpu);

	/* So we see what's up */
	announce_cpu(cpu, apicid);

	/*
	 * This grunge runs the startup process for
	 * the targeted processor.
	 */
	if (x86_platform.legacy.warm_reset) {

		pr_debug("Setting warm reset code and vector.\n");

		smpboot_setup_warm_reset_vector(start_ip);
		/*
		 * Be paranoid about clearing APIC errors.
		*/
		if (APIC_INTEGRATED(boot_cpu_apic_version)) {
			apic_write(APIC_ESR, 0);
			apic_read(APIC_ESR);
		}
	}

	smp_mb();

	/*
	 * Wake up a CPU in different cases:
	 * - Use a method from the APIC driver if one defined, with wakeup
	 *   straight to 64-bit mode preferred over wakeup to RM.
	 * Otherwise,
	 * - Use an INIT boot APIC message
	 */
	if (apic->wakeup_secondary_cpu_64)
		ret = apic->wakeup_secondary_cpu_64(apicid, start_ip);
	else if (apic->wakeup_secondary_cpu)
		ret = apic->wakeup_secondary_cpu(apicid, start_ip);
	else
		ret = wakeup_secondary_cpu_via_init(apicid, start_ip);

	/* If the wakeup mechanism failed, cleanup the warm reset vector */
	if (ret)
		arch_cpuhp_cleanup_kick_cpu(cpu);
	return ret;
}

int native_kick_ap(unsigned int cpu, struct task_struct *tidle)
{
	u32 apicid = apic->cpu_present_to_apicid(cpu);
	int err;

	lockdep_assert_irqs_enabled();

	pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu);

	if (apicid == BAD_APICID || !apic_id_valid(apicid)) {
		pr_err("CPU %u has invalid APIC ID %x. Aborting bringup\n", cpu, apicid);
		return -EINVAL;
	}

	if (!test_bit(apicid, phys_cpu_present_map)) {
		pr_err("CPU %u APIC ID %x is not present. Aborting bringup\n", cpu, apicid);
		return -EINVAL;
	}

	/*
	 * Save current MTRR state in case it was changed since early boot
	 * (e.g.
by the ACPI SMI) to initialize new CPUs with MTRRs in sync: */ mtrr_save_state(); /* the FPU context is blank, nobody can own it */ per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL; err = common_cpu_up(cpu, tidle); if (err) return err; err = do_boot_cpu(apicid, cpu, tidle); if (err) pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu); return err; } int arch_cpuhp_kick_ap_alive(unsigned int cpu, struct task_struct *tidle) { return smp_ops.kick_ap_alive(cpu, tidle); } void arch_cpuhp_cleanup_kick_cpu(unsigned int cpu) { /* Cleanup possible dangling ends... */ if (smp_ops.kick_ap_alive == native_kick_ap && x86_platform.legacy.warm_reset) smpboot_restore_warm_reset_vector(); } void arch_cpuhp_cleanup_dead_cpu(unsigned int cpu) { if (smp_ops.cleanup_dead_cpu) smp_ops.cleanup_dead_cpu(cpu); if (system_state == SYSTEM_RUNNING) pr_info("CPU %u is now offline\n", cpu); } void arch_cpuhp_sync_state_poll(void) { if (smp_ops.poll_sync_state) smp_ops.poll_sync_state(); } /** * arch_disable_smp_support() - Disables SMP support for x86 at boottime */ void __init arch_disable_smp_support(void) { disable_ioapic_support(); } /* * Fall back to non SMP mode after errors. * * RED-PEN audit/test this more. I bet there is more state messed up here. 
*/ static __init void disable_smp(void) { pr_info("SMP disabled\n"); disable_ioapic_support(); topology_reset_possible_cpus_up(); cpumask_set_cpu(0, topology_sibling_cpumask(0)); cpumask_set_cpu(0, topology_core_cpumask(0)); cpumask_set_cpu(0, topology_die_cpumask(0)); } static void __init smp_cpu_index_default(void) { int i; struct cpuinfo_x86 *c; for_each_possible_cpu(i) { c = &cpu_data(i); /* mark all to hotplug */ c->cpu_index = nr_cpu_ids; } } void __init smp_prepare_cpus_common(void) { unsigned int i; smp_cpu_index_default(); /* * Setup boot CPU information */ smp_store_boot_cpu_info(); /* Final full version of the data */ mb(); for_each_possible_cpu(i) { zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL); zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL); zalloc_cpumask_var(&per_cpu(cpu_l2c_shared_map, i), GFP_KERNEL); } set_cpu_sibling_map(0); } void __init smp_prepare_boot_cpu(void) { smp_ops.smp_prepare_boot_cpu(); } #ifdef CONFIG_X86_64 /* Establish whether parallel bringup can be supported. */ bool __init arch_cpuhp_init_parallel_bringup(void) { if (!x86_cpuinit.parallel_bringup) { pr_info("Parallel CPU startup disabled by the platform\n"); return false; } smpboot_control = STARTUP_READ_APICID; pr_debug("Parallel CPU startup enabled: 0x%08x\n", smpboot_control); return true; } #endif /* * Prepare for SMP bootup. * @max_cpus: configured maximum number of CPUs, It is a legacy parameter * for common interface support. 
*/ void __init native_smp_prepare_cpus(unsigned int max_cpus) { smp_prepare_cpus_common(); switch (apic_intr_mode) { case APIC_PIC: case APIC_VIRTUAL_WIRE_NO_CONFIG: disable_smp(); return; case APIC_SYMMETRIC_IO_NO_ROUTING: disable_smp(); /* Setup local timer */ x86_init.timers.setup_percpu_clockev(); return; case APIC_VIRTUAL_WIRE: case APIC_SYMMETRIC_IO: break; } /* Setup local timer */ x86_init.timers.setup_percpu_clockev(); pr_info("CPU0: "); print_cpu_info(&cpu_data(0)); uv_system_init(); smp_quirk_init_udelay(); speculative_store_bypass_ht_init(); snp_set_wakeup_secondary_cpu(); } void arch_thaw_secondary_cpus_begin(void) { set_cache_aps_delayed_init(true); } void arch_thaw_secondary_cpus_end(void) { cache_aps_init(); } /* * Early setup to make printk work. */ void __init native_smp_prepare_boot_cpu(void) { int me = smp_processor_id(); /* SMP handles this from setup_per_cpu_areas() */ if (!IS_ENABLED(CONFIG_SMP)) switch_gdt_and_percpu_base(me); native_pv_lock_init(); } void __init native_smp_cpus_done(unsigned int max_cpus) { pr_debug("Boot done\n"); build_sched_topology(); nmi_selftest(); impress_friends(); cache_aps_init(); } /* correctly size the local cpu masks */ void __init setup_cpu_local_masks(void) { alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask); } #ifdef CONFIG_HOTPLUG_CPU /* Recompute SMT state for all CPUs on offline */ static void recompute_smt_state(void) { int max_threads, cpu; max_threads = 0; for_each_online_cpu (cpu) { int threads = cpumask_weight(topology_sibling_cpumask(cpu)); if (threads > max_threads) max_threads = threads; } __max_smt_threads = max_threads; } static void remove_siblinginfo(int cpu) { int sibling; struct cpuinfo_x86 *c = &cpu_data(cpu); for_each_cpu(sibling, topology_core_cpumask(cpu)) { cpumask_clear_cpu(cpu, topology_core_cpumask(sibling)); /*/ * last thread sibling in this cpu core going down */ if (cpumask_weight(topology_sibling_cpumask(cpu)) == 1) cpu_data(sibling).booted_cores--; } for_each_cpu(sibling, 
topology_die_cpumask(cpu)) cpumask_clear_cpu(cpu, topology_die_cpumask(sibling)); for_each_cpu(sibling, topology_sibling_cpumask(cpu)) { cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling)); if (cpumask_weight(topology_sibling_cpumask(sibling)) == 1) cpu_data(sibling).smt_active = false; } for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling)); for_each_cpu(sibling, cpu_l2c_shared_mask(cpu)) cpumask_clear_cpu(cpu, cpu_l2c_shared_mask(sibling)); cpumask_clear(cpu_llc_shared_mask(cpu)); cpumask_clear(cpu_l2c_shared_mask(cpu)); cpumask_clear(topology_sibling_cpumask(cpu)); cpumask_clear(topology_core_cpumask(cpu)); cpumask_clear(topology_die_cpumask(cpu)); c->topo.core_id = 0; c->booted_cores = 0; cpumask_clear_cpu(cpu, cpu_sibling_setup_mask); recompute_smt_state(); } static void remove_cpu_from_maps(int cpu) { set_cpu_online(cpu, false); numa_remove_cpu(cpu); } void cpu_disable_common(void) { int cpu = smp_processor_id(); remove_siblinginfo(cpu); /* It's now safe to remove this processor from the online map */ lock_vector_lock(); remove_cpu_from_maps(cpu); unlock_vector_lock(); fixup_irqs(); lapic_offline(); } int native_cpu_disable(void) { int ret; ret = lapic_can_unplug_cpu(); if (ret) return ret; cpu_disable_common(); /* * Disable the local APIC. Otherwise IPI broadcasts will reach * it. It still responds normally to INIT, NMI, SMI, and SIPI * messages. * * Disabling the APIC must happen after cpu_disable_common() * which invokes fixup_irqs(). * * Disabling the APIC preserves already set bits in IRR, but * an interrupt arriving after disabling the local APIC does not * set the corresponding IRR bit. * * fixup_irqs() scans IRR for set bits so it can raise a not * yet handled interrupt on the new destination CPU via an IPI * but obviously it can't do so for IRR bits which are not set. * IOW, interrupts arriving after disabling the local APIC will * be lost. 
*/ apic_soft_disable(); return 0; } void play_dead_common(void) { idle_task_exit(); cpuhp_ap_report_dead(); local_irq_disable(); } /* * We need to flush the caches before going to sleep, lest we have * dirty data in our caches when we come back up. */ static inline void mwait_play_dead(void) { struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead); unsigned int eax, ebx, ecx, edx; unsigned int highest_cstate = 0; unsigned int highest_subcstate = 0; int i; if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) return; if (!this_cpu_has(X86_FEATURE_MWAIT)) return; if (!this_cpu_has(X86_FEATURE_CLFLUSH)) return; if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF) return; eax = CPUID_MWAIT_LEAF; ecx = 0; native_cpuid(&eax, &ebx, &ecx, &edx); /* * eax will be 0 if EDX enumeration is not valid. * Initialized below to cstate, sub_cstate value when EDX is valid. */ if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) { eax = 0; } else { edx >>= MWAIT_SUBSTATE_SIZE; for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) { if (edx & MWAIT_SUBSTATE_MASK) { highest_cstate = i; highest_subcstate = edx & MWAIT_SUBSTATE_MASK; } } eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) | (highest_subcstate - 1); } /* Set up state for the kexec() hack below */ md->status = CPUDEAD_MWAIT_WAIT; md->control = CPUDEAD_MWAIT_WAIT; wbinvd(); while (1) { /* * The CLFLUSH is a workaround for erratum AAI65 for * the Xeon 7400 series. It's not clear it is actually * needed, but it should be harmless in either case. * The WBINVD is insufficient due to the spurious-wakeup * case where we return around the loop. */ mb(); clflush(md); mb(); __monitor(md, 0, 0); mb(); __mwait(eax, 0); if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) { /* * Kexec is about to happen. Don't go back into mwait() as * the kexec kernel might overwrite text and data including * page tables and stack. 
So mwait() would resume when the * monitor cache line is written to and then the CPU goes * south due to overwritten text, page tables and stack. * * Note: This does _NOT_ protect against a stray MCE, NMI, * SMI. They will resume execution at the instruction * following the HLT instruction and run into the problem * which this is trying to prevent. */ WRITE_ONCE(md->status, CPUDEAD_MWAIT_KEXEC_HLT); while(1) native_halt(); } } } /* * Kick all "offline" CPUs out of mwait on kexec(). See comment in * mwait_play_dead(). */ void smp_kick_mwait_play_dead(void) { u32 newstate = CPUDEAD_MWAIT_KEXEC_HLT; struct mwait_cpu_dead *md; unsigned int cpu, i; for_each_cpu_andnot(cpu, cpu_present_mask, cpu_online_mask) { md = per_cpu_ptr(&mwait_cpu_dead, cpu); /* Does it sit in mwait_play_dead() ? */ if (READ_ONCE(md->status) != CPUDEAD_MWAIT_WAIT) continue; /* Wait up to 5ms */ for (i = 0; READ_ONCE(md->status) != newstate && i < 1000; i++) { /* Bring it out of mwait */ WRITE_ONCE(md->control, newstate); udelay(5); } if (READ_ONCE(md->status) != newstate) pr_err_once("CPU%u is stuck in mwait_play_dead()\n", cpu); } } void __noreturn hlt_play_dead(void) { if (__this_cpu_read(cpu_info.x86) >= 4) wbinvd(); while (1) native_halt(); } /* * native_play_dead() is essentially a __noreturn function, but it can't * be marked as such as the compiler may complain about it. */ void native_play_dead(void) { if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS)) __update_spec_ctrl(0); play_dead_common(); tboot_shutdown(TB_SHUTDOWN_WFS); mwait_play_dead(); if (cpuidle_play_dead()) hlt_play_dead(); } #else /* ... !CONFIG_HOTPLUG_CPU */ int native_cpu_disable(void) { return -ENOSYS; } void native_play_dead(void) { BUG(); } #endif
1071 1061 1066 1077 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2019 Facebook * Copyright 2020 Google LLC. */ #include <linux/rculist.h> #include <linux/list.h> #include <linux/hash.h> #include <linux/types.h> #include <linux/spinlock.h> #include <linux/bpf.h> #include <linux/bpf_local_storage.h> #include <net/sock.h> #include <uapi/linux/sock_diag.h> #include <uapi/linux/btf.h> #include <linux/bpf_lsm.h> #include <linux/btf_ids.h> #include <linux/fdtable.h> #include <linux/rcupdate_trace.h> DEFINE_BPF_STORAGE_CACHE(inode_cache); static struct bpf_local_storage __rcu ** inode_storage_ptr(void *owner) { struct inode *inode = owner; struct bpf_storage_blob *bsb; bsb = bpf_inode(inode); if (!bsb) return NULL; return &bsb->storage; } static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode, struct bpf_map *map, bool cacheit_lockit) { struct bpf_local_storage *inode_storage; struct bpf_local_storage_map *smap; struct bpf_storage_blob *bsb; bsb = bpf_inode(inode); if (!bsb) return NULL; inode_storage = rcu_dereference_check(bsb->storage, bpf_rcu_lock_held()); if (!inode_storage) return NULL; smap = (struct 
bpf_local_storage_map *)map; return bpf_local_storage_lookup(inode_storage, smap, cacheit_lockit); } void bpf_inode_storage_free(struct inode *inode) { struct bpf_local_storage *local_storage; struct bpf_storage_blob *bsb; bsb = bpf_inode(inode); if (!bsb) return; rcu_read_lock(); local_storage = rcu_dereference(bsb->storage); if (!local_storage) { rcu_read_unlock(); return; } bpf_local_storage_destroy(local_storage); rcu_read_unlock(); } static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key) { struct bpf_local_storage_data *sdata; struct fd f = fdget_raw(*(int *)key); if (!f.file) return ERR_PTR(-EBADF); sdata = inode_storage_lookup(file_inode(f.file), map, true); fdput(f); return sdata ? sdata->data : NULL; } static long bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_local_storage_data *sdata; struct fd f = fdget_raw(*(int *)key); if (!f.file) return -EBADF; if (!inode_storage_ptr(file_inode(f.file))) { fdput(f); return -EBADF; } sdata = bpf_local_storage_update(file_inode(f.file), (struct bpf_local_storage_map *)map, value, map_flags, GFP_ATOMIC); fdput(f); return PTR_ERR_OR_ZERO(sdata); } static int inode_storage_delete(struct inode *inode, struct bpf_map *map) { struct bpf_local_storage_data *sdata; sdata = inode_storage_lookup(inode, map, false); if (!sdata) return -ENOENT; bpf_selem_unlink(SELEM(sdata), false); return 0; } static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key) { struct fd f = fdget_raw(*(int *)key); int err; if (!f.file) return -EBADF; err = inode_storage_delete(file_inode(f.file), map); fdput(f); return err; } /* *gfp_flags* is a hidden argument provided by the verifier */ BPF_CALL_5(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode, void *, value, u64, flags, gfp_t, gfp_flags) { struct bpf_local_storage_data *sdata; WARN_ON_ONCE(!bpf_rcu_lock_held()); if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) return (unsigned 
long)NULL; /* explicitly check that the inode_storage_ptr is not * NULL as inode_storage_lookup returns NULL in this case and * bpf_local_storage_update expects the owner to have a * valid storage pointer. */ if (!inode || !inode_storage_ptr(inode)) return (unsigned long)NULL; sdata = inode_storage_lookup(inode, map, true); if (sdata) return (unsigned long)sdata->data; /* This helper must only called from where the inode is guaranteed * to have a refcount and cannot be freed. */ if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) { sdata = bpf_local_storage_update( inode, (struct bpf_local_storage_map *)map, value, BPF_NOEXIST, gfp_flags); return IS_ERR(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data; } return (unsigned long)NULL; } BPF_CALL_2(bpf_inode_storage_delete, struct bpf_map *, map, struct inode *, inode) { WARN_ON_ONCE(!bpf_rcu_lock_held()); if (!inode) return -EINVAL; /* This helper must only called from where the inode is guaranteed * to have a refcount and cannot be freed. 
*/ return inode_storage_delete(inode, map); } static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) { return -ENOTSUPP; } static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr) { return bpf_local_storage_map_alloc(attr, &inode_cache, false); } static void inode_storage_map_free(struct bpf_map *map) { bpf_local_storage_map_free(map, &inode_cache, NULL); } const struct bpf_map_ops inode_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, .map_alloc = inode_storage_map_alloc, .map_free = inode_storage_map_free, .map_get_next_key = notsupp_get_next_key, .map_lookup_elem = bpf_fd_inode_storage_lookup_elem, .map_update_elem = bpf_fd_inode_storage_update_elem, .map_delete_elem = bpf_fd_inode_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, .map_mem_usage = bpf_local_storage_map_mem_usage, .map_btf_id = &bpf_local_storage_map_btf_id[0], .map_owner_storage_ptr = inode_storage_ptr, }; BTF_ID_LIST_SINGLE(bpf_inode_storage_btf_ids, struct, inode) const struct bpf_func_proto bpf_inode_storage_get_proto = { .func = bpf_inode_storage_get, .gpl_only = false, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, .arg2_btf_id = &bpf_inode_storage_btf_ids[0], .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, .arg4_type = ARG_ANYTHING, }; const struct bpf_func_proto bpf_inode_storage_delete_proto = { .func = bpf_inode_storage_delete, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, .arg2_btf_id = &bpf_inode_storage_btf_ids[0], };
10 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2016-2020 Intel Corporation. All rights reserved. */ #include <linux/jump_label.h> #include <linux/uaccess.h> #include <linux/export.h> #include <linux/string.h> #include <linux/types.h> #include <asm/mce.h> #ifdef CONFIG_X86_MCE static DEFINE_STATIC_KEY_FALSE(copy_mc_fragile_key); void enable_copy_mc_fragile(void) { static_branch_inc(&copy_mc_fragile_key); } #define copy_mc_fragile_enabled (static_branch_unlikely(&copy_mc_fragile_key)) /* * Similar to copy_user_handle_tail, probe for the write fault point, or * source exception point. */ __visible notrace unsigned long copy_mc_fragile_handle_tail(char *to, char *from, unsigned len) { for (; len; --len, to++, from++) if (copy_mc_fragile(to, from, 1)) break; return len; } #else /* * No point in doing careful copying, or consulting a static key when * there is no #MC handler in the CONFIG_X86_MCE=n case. */ void enable_copy_mc_fragile(void) { } #define copy_mc_fragile_enabled (0) #endif unsigned long copy_mc_enhanced_fast_string(void *dst, const void *src, unsigned len); /** * copy_mc_to_kernel - memory copy that handles source exceptions * * @dst: destination address * @src: source address * @len: number of bytes to copy * * Call into the 'fragile' version on systems that benefit from avoiding * corner case poison consumption scenarios, For example, accessing * poison across 2 cachelines with a single instruction. Almost all * other uses case can use copy_mc_enhanced_fast_string() for a fast * recoverable copy, or fallback to plain memcpy. * * Return 0 for success, or number of bytes not copied if there was an * exception. 
*/ unsigned long __must_check copy_mc_to_kernel(void *dst, const void *src, unsigned len) { if (copy_mc_fragile_enabled) return copy_mc_fragile(dst, src, len); if (static_cpu_has(X86_FEATURE_ERMS)) return copy_mc_enhanced_fast_string(dst, src, len); memcpy(dst, src, len); return 0; } EXPORT_SYMBOL_GPL(copy_mc_to_kernel); unsigned long __must_check copy_mc_to_user(void __user *dst, const void *src, unsigned len) { unsigned long ret; if (copy_mc_fragile_enabled) { __uaccess_begin(); ret = copy_mc_fragile((__force void *)dst, src, len); __uaccess_end(); return ret; } if (static_cpu_has(X86_FEATURE_ERMS)) { __uaccess_begin(); ret = copy_mc_enhanced_fast_string((__force void *)dst, src, len); __uaccess_end(); return ret; } return copy_user_generic((__force void *)dst, src, len); }
126 51 1 126 126 123 44 33 122 123 123 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 // SPDX-License-Identifier: GPL-2.0 #include <linux/proc_fs.h> #include <linux/nsproxy.h> #include <linux/ptrace.h> #include <linux/namei.h> #include <linux/file.h> #include <linux/utsname.h> #include <net/net_namespace.h> #include <linux/ipc_namespace.h> #include <linux/pid_namespace.h> #include <linux/user_namespace.h> #include "internal.h" static const struct proc_ns_operations *ns_entries[] = { #ifdef CONFIG_NET_NS &netns_operations, #endif #ifdef CONFIG_UTS_NS &utsns_operations, #endif #ifdef CONFIG_IPC_NS &ipcns_operations, #endif #ifdef CONFIG_PID_NS &pidns_operations, &pidns_for_children_operations, #endif #ifdef CONFIG_USER_NS &userns_operations, #endif &mntns_operations, #ifdef CONFIG_CGROUPS &cgroupns_operations, #endif #ifdef CONFIG_TIME_NS &timens_operations, &timens_for_children_operations, #endif }; static const char *proc_ns_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops; struct task_struct *task; struct path ns_path; int error = -EACCES; if (!dentry) return ERR_PTR(-ECHILD); task = get_proc_task(inode); if (!task) return ERR_PTR(-EACCES); if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) goto out; error = ns_get_path(&ns_path, task, ns_ops); if (error) goto out; error = nd_jump_link(&ns_path); 
out: put_task_struct(task); return ERR_PTR(error); } static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen) { struct inode *inode = d_inode(dentry); const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops; struct task_struct *task; char name[50]; int res = -EACCES; task = get_proc_task(inode); if (!task) return res; if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { res = ns_get_name(name, sizeof(name), task, ns_ops); if (res >= 0) res = readlink_copy(buffer, buflen, name); } put_task_struct(task); return res; } static const struct inode_operations proc_ns_link_inode_operations = { .readlink = proc_ns_readlink, .get_link = proc_ns_get_link, .setattr = proc_setattr, }; static struct dentry *proc_ns_instantiate(struct dentry *dentry, struct task_struct *task, const void *ptr) { const struct proc_ns_operations *ns_ops = ptr; struct inode *inode; struct proc_inode *ei; inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK | S_IRWXUGO); if (!inode) return ERR_PTR(-ENOENT); ei = PROC_I(inode); inode->i_op = &proc_ns_link_inode_operations; ei->ns_ops = ns_ops; pid_update_inode(task, inode); d_set_d_op(dentry, &pid_dentry_operations); return d_splice_alias(inode, dentry); } static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx) { struct task_struct *task = get_proc_task(file_inode(file)); const struct proc_ns_operations **entry, **last; if (!task) return -ENOENT; if (!dir_emit_dots(file, ctx)) goto out; if (ctx->pos >= 2 + ARRAY_SIZE(ns_entries)) goto out; entry = ns_entries + (ctx->pos - 2); last = &ns_entries[ARRAY_SIZE(ns_entries) - 1]; while (entry <= last) { const struct proc_ns_operations *ops = *entry; if (!proc_fill_cache(file, ctx, ops->name, strlen(ops->name), proc_ns_instantiate, task, ops)) break; ctx->pos++; entry++; } out: put_task_struct(task); return 0; } const struct file_operations proc_ns_dir_operations = { .read = generic_read_dir, .iterate_shared = proc_ns_dir_readdir, .llseek = 
generic_file_llseek, }; static struct dentry *proc_ns_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct task_struct *task = get_proc_task(dir); const struct proc_ns_operations **entry, **last; unsigned int len = dentry->d_name.len; struct dentry *res = ERR_PTR(-ENOENT); if (!task) goto out_no_task; last = &ns_entries[ARRAY_SIZE(ns_entries)]; for (entry = ns_entries; entry < last; entry++) { if (strlen((*entry)->name) != len) continue; if (!memcmp(dentry->d_name.name, (*entry)->name, len)) break; } if (entry == last) goto out; res = proc_ns_instantiate(dentry, task, *entry); out: put_task_struct(task); out_no_task: return res; } const struct inode_operations proc_ns_dir_inode_operations = { .lookup = proc_ns_dir_lookup, .getattr = pid_getattr, .setattr = proc_setattr, };
1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 
526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 
1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2020 Google Corporation */ #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <net/bluetooth/mgmt.h> #include "hci_request.h" #include "mgmt_util.h" #include "msft.h" #define MSFT_RSSI_THRESHOLD_VALUE_MIN -127 #define MSFT_RSSI_THRESHOLD_VALUE_MAX 20 #define MSFT_RSSI_LOW_TIMEOUT_MAX 0x3C #define MSFT_OP_READ_SUPPORTED_FEATURES 0x00 struct msft_cp_read_supported_features { __u8 sub_opcode; } __packed; struct msft_rp_read_supported_features { __u8 status; __u8 sub_opcode; __le64 features; __u8 evt_prefix_len; __u8 evt_prefix[]; } __packed; #define MSFT_OP_LE_MONITOR_ADVERTISEMENT 0x03 #define MSFT_MONITOR_ADVERTISEMENT_TYPE_PATTERN 0x01 struct msft_le_monitor_advertisement_pattern { __u8 length; __u8 data_type; __u8 start_byte; __u8 pattern[]; }; struct msft_le_monitor_advertisement_pattern_data { __u8 count; __u8 data[]; }; struct msft_cp_le_monitor_advertisement { __u8 sub_opcode; __s8 rssi_high; __s8 rssi_low; __u8 rssi_low_interval; __u8 rssi_sampling_period; __u8 cond_type; 
__u8 data[]; } __packed; struct msft_rp_le_monitor_advertisement { __u8 status; __u8 sub_opcode; __u8 handle; } __packed; #define MSFT_OP_LE_CANCEL_MONITOR_ADVERTISEMENT 0x04 struct msft_cp_le_cancel_monitor_advertisement { __u8 sub_opcode; __u8 handle; } __packed; struct msft_rp_le_cancel_monitor_advertisement { __u8 status; __u8 sub_opcode; } __packed; #define MSFT_OP_LE_SET_ADVERTISEMENT_FILTER_ENABLE 0x05 struct msft_cp_le_set_advertisement_filter_enable { __u8 sub_opcode; __u8 enable; } __packed; struct msft_rp_le_set_advertisement_filter_enable { __u8 status; __u8 sub_opcode; } __packed; #define MSFT_EV_LE_MONITOR_DEVICE 0x02 struct msft_ev_le_monitor_device { __u8 addr_type; bdaddr_t bdaddr; __u8 monitor_handle; __u8 monitor_state; } __packed; struct msft_monitor_advertisement_handle_data { __u8 msft_handle; __u16 mgmt_handle; __s8 rssi_high; __s8 rssi_low; __u8 rssi_low_interval; __u8 rssi_sampling_period; __u8 cond_type; struct list_head list; }; enum monitor_addr_filter_state { AF_STATE_IDLE, AF_STATE_ADDING, AF_STATE_ADDED, AF_STATE_REMOVING, }; #define MSFT_MONITOR_ADVERTISEMENT_TYPE_ADDR 0x04 struct msft_monitor_addr_filter_data { __u8 msft_handle; __u8 pattern_handle; /* address filters pertain to */ __u16 mgmt_handle; int state; __s8 rssi_high; __s8 rssi_low; __u8 rssi_low_interval; __u8 rssi_sampling_period; __u8 addr_type; bdaddr_t bdaddr; struct list_head list; }; struct msft_data { __u64 features; __u8 evt_prefix_len; __u8 *evt_prefix; struct list_head handle_map; struct list_head address_filters; __u8 resuming; __u8 suspending; __u8 filter_enabled; /* To synchronize add/remove address filter and monitor device event.*/ struct mutex filter_lock; }; bool msft_monitor_supported(struct hci_dev *hdev) { return !!(msft_get_features(hdev) & MSFT_FEATURE_MASK_LE_ADV_MONITOR); } static bool read_supported_features(struct hci_dev *hdev, struct msft_data *msft) { struct msft_cp_read_supported_features cp; struct msft_rp_read_supported_features *rp; struct 
sk_buff *skb; cp.sub_opcode = MSFT_OP_READ_SUPPORTED_FEATURES; skb = __hci_cmd_sync(hdev, hdev->msft_opcode, sizeof(cp), &cp, HCI_CMD_TIMEOUT); if (IS_ERR(skb)) { bt_dev_err(hdev, "Failed to read MSFT supported features (%ld)", PTR_ERR(skb)); return false; } if (skb->len < sizeof(*rp)) { bt_dev_err(hdev, "MSFT supported features length mismatch"); goto failed; } rp = (struct msft_rp_read_supported_features *)skb->data; if (rp->sub_opcode != MSFT_OP_READ_SUPPORTED_FEATURES) goto failed; if (rp->evt_prefix_len > 0) { msft->evt_prefix = kmemdup(rp->evt_prefix, rp->evt_prefix_len, GFP_KERNEL); if (!msft->evt_prefix) goto failed; } msft->evt_prefix_len = rp->evt_prefix_len; msft->features = __le64_to_cpu(rp->features); if (msft->features & MSFT_FEATURE_MASK_CURVE_VALIDITY) hdev->msft_curve_validity = true; kfree_skb(skb); return true; failed: kfree_skb(skb); return false; } /* is_mgmt = true matches the handle exposed to userspace via mgmt. * is_mgmt = false matches the handle used by the msft controller. 
* This function requires the caller holds hdev->lock */ static struct msft_monitor_advertisement_handle_data *msft_find_handle_data (struct hci_dev *hdev, u16 handle, bool is_mgmt) { struct msft_monitor_advertisement_handle_data *entry; struct msft_data *msft = hdev->msft_data; list_for_each_entry(entry, &msft->handle_map, list) { if (is_mgmt && entry->mgmt_handle == handle) return entry; if (!is_mgmt && entry->msft_handle == handle) return entry; } return NULL; } /* This function requires the caller holds msft->filter_lock */ static struct msft_monitor_addr_filter_data *msft_find_address_data (struct hci_dev *hdev, u8 addr_type, bdaddr_t *addr, u8 pattern_handle) { struct msft_monitor_addr_filter_data *entry; struct msft_data *msft = hdev->msft_data; list_for_each_entry(entry, &msft->address_filters, list) { if (entry->pattern_handle == pattern_handle && addr_type == entry->addr_type && !bacmp(addr, &entry->bdaddr)) return entry; } return NULL; } /* This function requires the caller holds hdev->lock */ static int msft_monitor_device_del(struct hci_dev *hdev, __u16 mgmt_handle, bdaddr_t *bdaddr, __u8 addr_type, bool notify) { struct monitored_device *dev, *tmp; int count = 0; list_for_each_entry_safe(dev, tmp, &hdev->monitored_devices, list) { /* mgmt_handle == 0 indicates remove all devices, whereas, * bdaddr == NULL indicates remove all devices matching the * mgmt_handle. 
*/ if ((!mgmt_handle || dev->handle == mgmt_handle) && (!bdaddr || (!bacmp(bdaddr, &dev->bdaddr) && addr_type == dev->addr_type))) { if (notify && dev->notified) { mgmt_adv_monitor_device_lost(hdev, dev->handle, &dev->bdaddr, dev->addr_type); } list_del(&dev->list); kfree(dev); count++; } } return count; } static int msft_le_monitor_advertisement_cb(struct hci_dev *hdev, u16 opcode, struct adv_monitor *monitor, struct sk_buff *skb) { struct msft_rp_le_monitor_advertisement *rp; struct msft_monitor_advertisement_handle_data *handle_data; struct msft_data *msft = hdev->msft_data; int status = 0; hci_dev_lock(hdev); rp = (struct msft_rp_le_monitor_advertisement *)skb->data; if (skb->len < sizeof(*rp)) { status = HCI_ERROR_UNSPECIFIED; goto unlock; } status = rp->status; if (status) goto unlock; handle_data = kmalloc(sizeof(*handle_data), GFP_KERNEL); if (!handle_data) { status = HCI_ERROR_UNSPECIFIED; goto unlock; } handle_data->mgmt_handle = monitor->handle; handle_data->msft_handle = rp->handle; handle_data->cond_type = MSFT_MONITOR_ADVERTISEMENT_TYPE_PATTERN; INIT_LIST_HEAD(&handle_data->list); list_add(&handle_data->list, &msft->handle_map); monitor->state = ADV_MONITOR_STATE_OFFLOADED; unlock: if (status) hci_free_adv_monitor(hdev, monitor); hci_dev_unlock(hdev); return status; } /* This function requires the caller holds hci_req_sync_lock */ static void msft_remove_addr_filters_sync(struct hci_dev *hdev, u8 handle) { struct msft_monitor_addr_filter_data *address_filter, *n; struct msft_cp_le_cancel_monitor_advertisement cp; struct msft_data *msft = hdev->msft_data; struct list_head head; struct sk_buff *skb; INIT_LIST_HEAD(&head); /* Cancel all corresponding address monitors */ mutex_lock(&msft->filter_lock); list_for_each_entry_safe(address_filter, n, &msft->address_filters, list) { if (address_filter->pattern_handle != handle) continue; list_del(&address_filter->list); /* Keep the address filter and let * msft_add_address_filter_sync() remove and free the 
address * filter. */ if (address_filter->state == AF_STATE_ADDING) { address_filter->state = AF_STATE_REMOVING; continue; } /* Keep the address filter and let * msft_cancel_address_filter_sync() remove and free the address * filter */ if (address_filter->state == AF_STATE_REMOVING) continue; list_add_tail(&address_filter->list, &head); } mutex_unlock(&msft->filter_lock); list_for_each_entry_safe(address_filter, n, &head, list) { list_del(&address_filter->list); cp.sub_opcode = MSFT_OP_LE_CANCEL_MONITOR_ADVERTISEMENT; cp.handle = address_filter->msft_handle; skb = __hci_cmd_sync(hdev, hdev->msft_opcode, sizeof(cp), &cp, HCI_CMD_TIMEOUT); if (IS_ERR(skb)) { kfree(address_filter); continue; } kfree_skb(skb); bt_dev_dbg(hdev, "MSFT: Canceled device %pMR address filter", &address_filter->bdaddr); kfree(address_filter); } } static int msft_le_cancel_monitor_advertisement_cb(struct hci_dev *hdev, u16 opcode, struct adv_monitor *monitor, struct sk_buff *skb) { struct msft_rp_le_cancel_monitor_advertisement *rp; struct msft_monitor_advertisement_handle_data *handle_data; struct msft_data *msft = hdev->msft_data; int status = 0; u8 msft_handle; rp = (struct msft_rp_le_cancel_monitor_advertisement *)skb->data; if (skb->len < sizeof(*rp)) { status = HCI_ERROR_UNSPECIFIED; goto done; } status = rp->status; if (status) goto done; hci_dev_lock(hdev); handle_data = msft_find_handle_data(hdev, monitor->handle, true); if (handle_data) { if (monitor->state == ADV_MONITOR_STATE_OFFLOADED) monitor->state = ADV_MONITOR_STATE_REGISTERED; /* Do not free the monitor if it is being removed due to * suspend. It will be re-monitored on resume. 
*/ if (!msft->suspending) { hci_free_adv_monitor(hdev, monitor); /* Clear any monitored devices by this Adv Monitor */ msft_monitor_device_del(hdev, handle_data->mgmt_handle, NULL, 0, false); } msft_handle = handle_data->msft_handle; list_del(&handle_data->list); kfree(handle_data); hci_dev_unlock(hdev); msft_remove_addr_filters_sync(hdev, msft_handle); } else { hci_dev_unlock(hdev); } done: return status; } /* This function requires the caller holds hci_req_sync_lock */ static int msft_remove_monitor_sync(struct hci_dev *hdev, struct adv_monitor *monitor) { struct msft_cp_le_cancel_monitor_advertisement cp; struct msft_monitor_advertisement_handle_data *handle_data; struct sk_buff *skb; handle_data = msft_find_handle_data(hdev, monitor->handle, true); /* If no matched handle, just remove without telling controller */ if (!handle_data) return -ENOENT; cp.sub_opcode = MSFT_OP_LE_CANCEL_MONITOR_ADVERTISEMENT; cp.handle = handle_data->msft_handle; skb = __hci_cmd_sync(hdev, hdev->msft_opcode, sizeof(cp), &cp, HCI_CMD_TIMEOUT); if (IS_ERR(skb)) return PTR_ERR(skb); return msft_le_cancel_monitor_advertisement_cb(hdev, hdev->msft_opcode, monitor, skb); } /* This function requires the caller holds hci_req_sync_lock */ int msft_suspend_sync(struct hci_dev *hdev) { struct msft_data *msft = hdev->msft_data; struct adv_monitor *monitor; int handle = 0; if (!msft || !msft_monitor_supported(hdev)) return 0; msft->suspending = true; while (1) { monitor = idr_get_next(&hdev->adv_monitors_idr, &handle); if (!monitor) break; msft_remove_monitor_sync(hdev, monitor); handle++; } /* All monitors have been removed */ msft->suspending = false; return 0; } static bool msft_monitor_rssi_valid(struct adv_monitor *monitor) { struct adv_rssi_thresholds *r = &monitor->rssi; if (r->high_threshold < MSFT_RSSI_THRESHOLD_VALUE_MIN || r->high_threshold > MSFT_RSSI_THRESHOLD_VALUE_MAX || r->low_threshold < MSFT_RSSI_THRESHOLD_VALUE_MIN || r->low_threshold > MSFT_RSSI_THRESHOLD_VALUE_MAX) return 
false; /* High_threshold_timeout is not supported, * once high_threshold is reached, events are immediately reported. */ if (r->high_threshold_timeout != 0) return false; if (r->low_threshold_timeout > MSFT_RSSI_LOW_TIMEOUT_MAX) return false; /* Sampling period from 0x00 to 0xFF are all allowed */ return true; } static bool msft_monitor_pattern_valid(struct adv_monitor *monitor) { return msft_monitor_rssi_valid(monitor); /* No additional check needed for pattern-based monitor */ } static int msft_add_monitor_sync(struct hci_dev *hdev, struct adv_monitor *monitor) { struct msft_cp_le_monitor_advertisement *cp; struct msft_le_monitor_advertisement_pattern_data *pattern_data; struct msft_monitor_advertisement_handle_data *handle_data; struct msft_le_monitor_advertisement_pattern *pattern; struct adv_pattern *entry; size_t total_size = sizeof(*cp) + sizeof(*pattern_data); ptrdiff_t offset = 0; u8 pattern_count = 0; struct sk_buff *skb; int err; if (!msft_monitor_pattern_valid(monitor)) return -EINVAL; list_for_each_entry(entry, &monitor->patterns, list) { pattern_count++; total_size += sizeof(*pattern) + entry->length; } cp = kmalloc(total_size, GFP_KERNEL); if (!cp) return -ENOMEM; cp->sub_opcode = MSFT_OP_LE_MONITOR_ADVERTISEMENT; cp->rssi_high = monitor->rssi.high_threshold; cp->rssi_low = monitor->rssi.low_threshold; cp->rssi_low_interval = (u8)monitor->rssi.low_threshold_timeout; cp->rssi_sampling_period = monitor->rssi.sampling_period; cp->cond_type = MSFT_MONITOR_ADVERTISEMENT_TYPE_PATTERN; pattern_data = (void *)cp->data; pattern_data->count = pattern_count; list_for_each_entry(entry, &monitor->patterns, list) { pattern = (void *)(pattern_data->data + offset); /* the length also includes data_type and offset */ pattern->length = entry->length + 2; pattern->data_type = entry->ad_type; pattern->start_byte = entry->offset; memcpy(pattern->pattern, entry->value, entry->length); offset += sizeof(*pattern) + entry->length; } skb = __hci_cmd_sync(hdev, 
hdev->msft_opcode, total_size, cp, HCI_CMD_TIMEOUT); if (IS_ERR(skb)) { err = PTR_ERR(skb); goto out_free; } err = msft_le_monitor_advertisement_cb(hdev, hdev->msft_opcode, monitor, skb); if (err) goto out_free; handle_data = msft_find_handle_data(hdev, monitor->handle, true); if (!handle_data) { err = -ENODATA; goto out_free; } handle_data->rssi_high = cp->rssi_high; handle_data->rssi_low = cp->rssi_low; handle_data->rssi_low_interval = cp->rssi_low_interval; handle_data->rssi_sampling_period = cp->rssi_sampling_period; out_free: kfree(cp); return err; } /* This function requires the caller holds hci_req_sync_lock */ static void reregister_monitor(struct hci_dev *hdev) { struct adv_monitor *monitor; struct msft_data *msft = hdev->msft_data; int handle = 0; if (!msft) return; msft->resuming = true; while (1) { monitor = idr_get_next(&hdev->adv_monitors_idr, &handle); if (!monitor) break; msft_add_monitor_sync(hdev, monitor); handle++; } /* All monitors have been reregistered */ msft->resuming = false; } /* This function requires the caller holds hci_req_sync_lock */ int msft_resume_sync(struct hci_dev *hdev) { struct msft_data *msft = hdev->msft_data; if (!msft || !msft_monitor_supported(hdev)) return 0; hci_dev_lock(hdev); /* Clear already tracked devices on resume. Once the monitors are * reregistered, devices in range will be found again after resume. 
*/ hdev->advmon_pend_notify = false; msft_monitor_device_del(hdev, 0, NULL, 0, true); hci_dev_unlock(hdev); reregister_monitor(hdev); return 0; } /* This function requires the caller holds hci_req_sync_lock */ void msft_do_open(struct hci_dev *hdev) { struct msft_data *msft = hdev->msft_data; if (hdev->msft_opcode == HCI_OP_NOP) return; if (!msft) { bt_dev_err(hdev, "MSFT extension not registered"); return; } bt_dev_dbg(hdev, "Initialize MSFT extension"); /* Reset existing MSFT data before re-reading */ kfree(msft->evt_prefix); msft->evt_prefix = NULL; msft->evt_prefix_len = 0; msft->features = 0; if (!read_supported_features(hdev, msft)) { hdev->msft_data = NULL; kfree(msft); return; } if (msft_monitor_supported(hdev)) { msft->resuming = true; msft_set_filter_enable(hdev, true); /* Monitors get removed on power off, so we need to explicitly * tell the controller to re-monitor. */ reregister_monitor(hdev); } } void msft_do_close(struct hci_dev *hdev) { struct msft_data *msft = hdev->msft_data; struct msft_monitor_advertisement_handle_data *handle_data, *tmp; struct msft_monitor_addr_filter_data *address_filter, *n; struct adv_monitor *monitor; if (!msft) return; bt_dev_dbg(hdev, "Cleanup of MSFT extension"); /* The controller will silently remove all monitors on power off. * Therefore, remove handle_data mapping and reset monitor state. 
*/ list_for_each_entry_safe(handle_data, tmp, &msft->handle_map, list) { monitor = idr_find(&hdev->adv_monitors_idr, handle_data->mgmt_handle); if (monitor && monitor->state == ADV_MONITOR_STATE_OFFLOADED) monitor->state = ADV_MONITOR_STATE_REGISTERED; list_del(&handle_data->list); kfree(handle_data); } mutex_lock(&msft->filter_lock); list_for_each_entry_safe(address_filter, n, &msft->address_filters, list) { list_del(&address_filter->list); kfree(address_filter); } mutex_unlock(&msft->filter_lock); hci_dev_lock(hdev); /* Clear any devices that are being monitored and notify device lost */ hdev->advmon_pend_notify = false; msft_monitor_device_del(hdev, 0, NULL, 0, true); hci_dev_unlock(hdev); } static int msft_cancel_address_filter_sync(struct hci_dev *hdev, void *data) { struct msft_monitor_addr_filter_data *address_filter = data; struct msft_cp_le_cancel_monitor_advertisement cp; struct msft_data *msft = hdev->msft_data; struct sk_buff *skb; int err = 0; if (!msft) { bt_dev_err(hdev, "MSFT: msft data is freed"); return -EINVAL; } /* The address filter has been removed by hci dev close */ if (!test_bit(HCI_UP, &hdev->flags)) return 0; mutex_lock(&msft->filter_lock); list_del(&address_filter->list); mutex_unlock(&msft->filter_lock); cp.sub_opcode = MSFT_OP_LE_CANCEL_MONITOR_ADVERTISEMENT; cp.handle = address_filter->msft_handle; skb = __hci_cmd_sync(hdev, hdev->msft_opcode, sizeof(cp), &cp, HCI_CMD_TIMEOUT); if (IS_ERR(skb)) { bt_dev_err(hdev, "MSFT: Failed to cancel address (%pMR) filter", &address_filter->bdaddr); err = PTR_ERR(skb); goto done; } kfree_skb(skb); bt_dev_dbg(hdev, "MSFT: Canceled device %pMR address filter", &address_filter->bdaddr); done: kfree(address_filter); return err; } void msft_register(struct hci_dev *hdev) { struct msft_data *msft = NULL; bt_dev_dbg(hdev, "Register MSFT extension"); msft = kzalloc(sizeof(*msft), GFP_KERNEL); if (!msft) { bt_dev_err(hdev, "Failed to register MSFT extension"); return; } INIT_LIST_HEAD(&msft->handle_map); 
INIT_LIST_HEAD(&msft->address_filters); hdev->msft_data = msft; mutex_init(&msft->filter_lock); } void msft_unregister(struct hci_dev *hdev) { struct msft_data *msft = hdev->msft_data; if (!msft) return; bt_dev_dbg(hdev, "Unregister MSFT extension"); hdev->msft_data = NULL; kfree(msft->evt_prefix); mutex_destroy(&msft->filter_lock); kfree(msft); } /* This function requires the caller holds hdev->lock */ static void msft_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 addr_type, __u16 mgmt_handle) { struct monitored_device *dev; dev = kmalloc(sizeof(*dev), GFP_KERNEL); if (!dev) { bt_dev_err(hdev, "MSFT vendor event %u: no memory", MSFT_EV_LE_MONITOR_DEVICE); return; } bacpy(&dev->bdaddr, bdaddr); dev->addr_type = addr_type; dev->handle = mgmt_handle; dev->notified = false; INIT_LIST_HEAD(&dev->list); list_add(&dev->list, &hdev->monitored_devices); hdev->advmon_pend_notify = true; } /* This function requires the caller holds hdev->lock */ static void msft_device_lost(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 addr_type, __u16 mgmt_handle) { if (!msft_monitor_device_del(hdev, mgmt_handle, bdaddr, addr_type, true)) { bt_dev_err(hdev, "MSFT vendor event %u: dev %pMR not in list", MSFT_EV_LE_MONITOR_DEVICE, bdaddr); } } static void *msft_skb_pull(struct hci_dev *hdev, struct sk_buff *skb, u8 ev, size_t len) { void *data; data = skb_pull_data(skb, len); if (!data) bt_dev_err(hdev, "Malformed MSFT vendor event: 0x%02x", ev); return data; } static int msft_add_address_filter_sync(struct hci_dev *hdev, void *data) { struct msft_monitor_addr_filter_data *address_filter = data; struct msft_rp_le_monitor_advertisement *rp; struct msft_cp_le_monitor_advertisement *cp; struct msft_data *msft = hdev->msft_data; struct sk_buff *skb = NULL; bool remove = false; size_t size; if (!msft) { bt_dev_err(hdev, "MSFT: msft data is freed"); return -EINVAL; } /* The address filter has been removed by hci dev close */ if (!test_bit(HCI_UP, &hdev->flags)) return -ENODEV; /* We are 
safe to use the address filter from now on. * msft_monitor_device_evt() wouldn't delete this filter because it's * not been added by now. * And all other functions that requiring hci_req_sync_lock wouldn't * touch this filter before this func completes because it's protected * by hci_req_sync_lock. */ if (address_filter->state == AF_STATE_REMOVING) { mutex_lock(&msft->filter_lock); list_del(&address_filter->list); mutex_unlock(&msft->filter_lock); kfree(address_filter); return 0; } size = sizeof(*cp) + sizeof(address_filter->addr_type) + sizeof(address_filter->bdaddr); cp = kzalloc(size, GFP_KERNEL); if (!cp) { bt_dev_err(hdev, "MSFT: Alloc cmd param err"); remove = true; goto done; } cp->sub_opcode = MSFT_OP_LE_MONITOR_ADVERTISEMENT; cp->rssi_high = address_filter->rssi_high; cp->rssi_low = address_filter->rssi_low; cp->rssi_low_interval = address_filter->rssi_low_interval; cp->rssi_sampling_period = address_filter->rssi_sampling_period; cp->cond_type = MSFT_MONITOR_ADVERTISEMENT_TYPE_ADDR; cp->data[0] = address_filter->addr_type; memcpy(&cp->data[1], &address_filter->bdaddr, sizeof(address_filter->bdaddr)); skb = __hci_cmd_sync(hdev, hdev->msft_opcode, size, cp, HCI_CMD_TIMEOUT); kfree(cp); if (IS_ERR(skb)) { bt_dev_err(hdev, "Failed to enable address %pMR filter", &address_filter->bdaddr); skb = NULL; remove = true; goto done; } rp = skb_pull_data(skb, sizeof(*rp)); if (!rp || rp->sub_opcode != MSFT_OP_LE_MONITOR_ADVERTISEMENT || rp->status) remove = true; done: mutex_lock(&msft->filter_lock); if (remove) { bt_dev_warn(hdev, "MSFT: Remove address (%pMR) filter", &address_filter->bdaddr); list_del(&address_filter->list); kfree(address_filter); } else { address_filter->state = AF_STATE_ADDED; address_filter->msft_handle = rp->handle; bt_dev_dbg(hdev, "MSFT: Address %pMR filter enabled", &address_filter->bdaddr); } mutex_unlock(&msft->filter_lock); kfree_skb(skb); return 0; } /* This function requires the caller holds msft->filter_lock */ static struct 
msft_monitor_addr_filter_data *msft_add_address_filter (struct hci_dev *hdev, u8 addr_type, bdaddr_t *bdaddr, struct msft_monitor_advertisement_handle_data *handle_data) { struct msft_monitor_addr_filter_data *address_filter = NULL; struct msft_data *msft = hdev->msft_data; int err; address_filter = kzalloc(sizeof(*address_filter), GFP_KERNEL); if (!address_filter) return NULL; address_filter->state = AF_STATE_ADDING; address_filter->msft_handle = 0xff; address_filter->pattern_handle = handle_data->msft_handle; address_filter->mgmt_handle = handle_data->mgmt_handle; address_filter->rssi_high = handle_data->rssi_high; address_filter->rssi_low = handle_data->rssi_low; address_filter->rssi_low_interval = handle_data->rssi_low_interval; address_filter->rssi_sampling_period = handle_data->rssi_sampling_period; address_filter->addr_type = addr_type; bacpy(&address_filter->bdaddr, bdaddr); /* With the above AF_STATE_ADDING, duplicated address filter can be * avoided when receiving monitor device event (found/lost) frequently * for the same device. 
*/ list_add_tail(&address_filter->list, &msft->address_filters); err = hci_cmd_sync_queue(hdev, msft_add_address_filter_sync, address_filter, NULL); if (err < 0) { bt_dev_err(hdev, "MSFT: Add address %pMR filter err", bdaddr); list_del(&address_filter->list); kfree(address_filter); return NULL; } bt_dev_dbg(hdev, "MSFT: Add device %pMR address filter", &address_filter->bdaddr); return address_filter; } /* This function requires the caller holds hdev->lock */ static void msft_monitor_device_evt(struct hci_dev *hdev, struct sk_buff *skb) { struct msft_monitor_addr_filter_data *n, *address_filter = NULL; struct msft_ev_le_monitor_device *ev; struct msft_monitor_advertisement_handle_data *handle_data; struct msft_data *msft = hdev->msft_data; u16 mgmt_handle = 0xffff; u8 addr_type; ev = msft_skb_pull(hdev, skb, MSFT_EV_LE_MONITOR_DEVICE, sizeof(*ev)); if (!ev) return; bt_dev_dbg(hdev, "MSFT vendor event 0x%02x: handle 0x%04x state %d addr %pMR", MSFT_EV_LE_MONITOR_DEVICE, ev->monitor_handle, ev->monitor_state, &ev->bdaddr); handle_data = msft_find_handle_data(hdev, ev->monitor_handle, false); if (!test_bit(HCI_QUIRK_USE_MSFT_EXT_ADDRESS_FILTER, &hdev->quirks)) { if (!handle_data) return; mgmt_handle = handle_data->mgmt_handle; goto report_state; } if (handle_data) { /* Don't report any device found/lost event from pattern * monitors. Pattern monitor always has its address filters for * tracking devices. */ address_filter = msft_find_address_data(hdev, ev->addr_type, &ev->bdaddr, handle_data->msft_handle); if (address_filter) return; if (ev->monitor_state && handle_data->cond_type == MSFT_MONITOR_ADVERTISEMENT_TYPE_PATTERN) msft_add_address_filter(hdev, ev->addr_type, &ev->bdaddr, handle_data); return; } /* This device event is not from pattern monitor. * Report it if there is a corresponding address_filter for it. 
*/ list_for_each_entry(n, &msft->address_filters, list) { if (n->state == AF_STATE_ADDED && n->msft_handle == ev->monitor_handle) { mgmt_handle = n->mgmt_handle; address_filter = n; break; } } if (!address_filter) { bt_dev_warn(hdev, "MSFT: Unexpected device event %pMR, %u, %u", &ev->bdaddr, ev->monitor_handle, ev->monitor_state); return; } report_state: switch (ev->addr_type) { case ADDR_LE_DEV_PUBLIC: addr_type = BDADDR_LE_PUBLIC; break; case ADDR_LE_DEV_RANDOM: addr_type = BDADDR_LE_RANDOM; break; default: bt_dev_err(hdev, "MSFT vendor event 0x%02x: unknown addr type 0x%02x", MSFT_EV_LE_MONITOR_DEVICE, ev->addr_type); return; } if (ev->monitor_state) { msft_device_found(hdev, &ev->bdaddr, addr_type, mgmt_handle); } else { if (address_filter && address_filter->state == AF_STATE_ADDED) { address_filter->state = AF_STATE_REMOVING; hci_cmd_sync_queue(hdev, msft_cancel_address_filter_sync, address_filter, NULL); } msft_device_lost(hdev, &ev->bdaddr, addr_type, mgmt_handle); } } void msft_vendor_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb) { struct msft_data *msft = hdev->msft_data; u8 *evt_prefix; u8 *evt; if (!msft) return; /* When the extension has defined an event prefix, check that it * matches, and otherwise just return. */ if (msft->evt_prefix_len > 0) { evt_prefix = msft_skb_pull(hdev, skb, 0, msft->evt_prefix_len); if (!evt_prefix) return; if (memcmp(evt_prefix, msft->evt_prefix, msft->evt_prefix_len)) return; } /* Every event starts at least with an event code and the rest of * the data is variable and depends on the event code. 
*/ if (skb->len < 1) return; evt = msft_skb_pull(hdev, skb, 0, sizeof(*evt)); if (!evt) return; hci_dev_lock(hdev); switch (*evt) { case MSFT_EV_LE_MONITOR_DEVICE: mutex_lock(&msft->filter_lock); msft_monitor_device_evt(hdev, skb); mutex_unlock(&msft->filter_lock); break; default: bt_dev_dbg(hdev, "MSFT vendor event 0x%02x", *evt); break; } hci_dev_unlock(hdev); } __u64 msft_get_features(struct hci_dev *hdev) { struct msft_data *msft = hdev->msft_data; return msft ? msft->features : 0; } static void msft_le_set_advertisement_filter_enable_cb(struct hci_dev *hdev, void *user_data, u8 status) { struct msft_cp_le_set_advertisement_filter_enable *cp = user_data; struct msft_data *msft = hdev->msft_data; /* Error 0x0C would be returned if the filter enabled status is * already set to whatever we were trying to set. * Although the default state should be disabled, some controller set * the initial value to enabled. Because there is no way to know the * actual initial value before sending this command, here we also treat * error 0x0C as success. */ if (status != 0x00 && status != 0x0C) return; hci_dev_lock(hdev); msft->filter_enabled = cp->enable; if (status == 0x0C) bt_dev_warn(hdev, "MSFT filter_enable is already %s", cp->enable ? 
"on" : "off"); hci_dev_unlock(hdev); } /* This function requires the caller holds hci_req_sync_lock */ int msft_add_monitor_pattern(struct hci_dev *hdev, struct adv_monitor *monitor) { struct msft_data *msft = hdev->msft_data; if (!msft) return -EOPNOTSUPP; if (msft->resuming || msft->suspending) return -EBUSY; return msft_add_monitor_sync(hdev, monitor); } /* This function requires the caller holds hci_req_sync_lock */ int msft_remove_monitor(struct hci_dev *hdev, struct adv_monitor *monitor) { struct msft_data *msft = hdev->msft_data; if (!msft) return -EOPNOTSUPP; if (msft->resuming || msft->suspending) return -EBUSY; return msft_remove_monitor_sync(hdev, monitor); } int msft_set_filter_enable(struct hci_dev *hdev, bool enable) { struct msft_cp_le_set_advertisement_filter_enable cp; struct msft_data *msft = hdev->msft_data; int err; if (!msft) return -EOPNOTSUPP; cp.sub_opcode = MSFT_OP_LE_SET_ADVERTISEMENT_FILTER_ENABLE; cp.enable = enable; err = __hci_cmd_sync_status(hdev, hdev->msft_opcode, sizeof(cp), &cp, HCI_CMD_TIMEOUT); msft_le_set_advertisement_filter_enable_cb(hdev, &cp, err); return 0; } bool msft_curve_validity(struct hci_dev *hdev) { return hdev->msft_curve_validity; }
11 12 11 1994 1990 67 1934 1962 1953 11 11 10 1427 549 548 548 548 549 42 41 81 82 81 81 82 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 
505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 
1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 
1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1995 Linus Torvalds * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar */ #include <linux/sched.h> /* test_thread_flag(), ... */ #include <linux/sched/task_stack.h> /* task_stack_*(), ... */ #include <linux/kdebug.h> /* oops_begin/end, ... */ #include <linux/extable.h> /* search_exception_tables */ #include <linux/memblock.h> /* max_low_pfn */ #include <linux/kfence.h> /* kfence_handle_page_fault */ #include <linux/kprobes.h> /* NOKPROBE_SYMBOL, ... */ #include <linux/mmiotrace.h> /* kmmio_handler, ... */ #include <linux/perf_event.h> /* perf_sw_event */ #include <linux/hugetlb.h> /* hstate_index_to_shift */ #include <linux/prefetch.h> /* prefetchw */ #include <linux/context_tracking.h> /* exception_enter(), ... */ #include <linux/uaccess.h> /* faulthandler_disabled() */ #include <linux/efi.h> /* efi_crash_gracefully_on_page_fault()*/ #include <linux/mm_types.h> #include <linux/mm.h> /* find_and_lock_vma() */ #include <asm/cpufeature.h> /* boot_cpu_has, ... */ #include <asm/traps.h> /* dotraplinkage, ... 
*/
#include <asm/fixmap.h>			/* VSYSCALL_ADDR		*/
#include <asm/vsyscall.h>		/* emulate_vsyscall		*/
#include <asm/vm86.h>			/* struct vm86			*/
#include <asm/mmu_context.h>		/* vma_pkey()			*/
#include <asm/efi.h>			/* efi_crash_gracefully_on_page_fault()*/
#include <asm/desc.h>			/* store_idt(), ...		*/
#include <asm/cpu_entry_area.h>		/* exception stack		*/
#include <asm/pgtable_areas.h>		/* VMALLOC_START, ...		*/
#include <asm/kvm_para.h>		/* kvm_handle_async_pf		*/
#include <asm/vdso.h>			/* fixup_vdso_exception()	*/
#include <asm/irq_stack.h>
#include <asm/fred.h>
#include <asm/sev.h>			/* snp_dump_hva_rmpentry()	*/

#define CREATE_TRACE_POINTS
#include <asm/trace/exceptions.h>

/*
 * Returns 0 if mmiotrace is disabled, or if the fault is not
 * handled by mmiotrace.
 *
 * Returns -1 when kmmio is active and kmmio_handler() returned 1
 * for this fault.
 */
static nokprobe_inline int
kmmio_fault(struct pt_regs *regs, unsigned long addr)
{
	if (unlikely(is_kmmio_active()))
		if (kmmio_handler(regs, addr) == 1)
			return -1;
	return 0;
}

/*
 * Prefetch quirks:
 *
 * 32-bit mode:
 *
 *   Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 *   Check that here and ignore it.  This is AMD erratum #91.
 *
 * 64-bit mode:
 *
 *   Sometimes the CPU reports invalid exceptions on prefetch.
 *   Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner.
 *
 * check_prefetch_opcode() classifies a single byte of the instruction:
 *
 *   regs:     register state; only consulted for the 0x4x (REX) case
 *   instr:    address of the byte that follows 'opcode'
 *   opcode:   the byte that was just read by the caller
 *   prefetch: written only in the 0x0x case, after the next byte has been
 *             read successfully; set to non-zero if 'opcode' is 0x0F and
 *             the next byte is 0x0D or 0x18, to zero otherwise
 *
 * Returns non-zero when 'opcode' is a recognized prefix (the caller keeps
 * scanning), and 0 when scanning should stop.
 */
static inline int
check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
		      unsigned char opcode, int *prefetch)
{
	unsigned char instr_hi = opcode & 0xf0;
	unsigned char instr_lo = opcode & 0x0f;

	switch (instr_hi) {
	case 0x20:
	case 0x30:
		/*
		 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
		 * In X86_64 long mode, the CPU will signal invalid
		 * opcode if some of these prefixes are present so
		 * X86_64 will never get here anyway
		 */
		return ((instr_lo & 7) == 0x6);
#ifdef CONFIG_X86_64
	case 0x40:
		/*
		 * In 64-bit mode 0x40..0x4F are valid REX prefixes
		 */
		return (!user_mode(regs) || user_64bit_mode(regs));
#endif
	case 0x60:
		/* 0x64 thru 0x67 are valid prefixes in all modes. */
		return (instr_lo & 0xC) == 0x4;
	case 0xF0:
		/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
		return !instr_lo || (instr_lo>>1) == 1;
	case 0x00:
		/* Prefetch instruction is 0x0F0D or 0x0F18 */
		/* From here on 'opcode' holds the byte following the one passed in. */
		if (get_kernel_nofault(opcode, instr))
			return 0;

		*prefetch = (instr_lo == 0xF) &&
			(opcode == 0x0D || opcode == 0x18);
		return 0;
	default:
		return 0;
	}
}

/*
 * True when CONFIG_CPU_SUP_AMD is enabled and the boot CPU is an AMD
 * family 0xf part with a model number below 0x40.
 */
static bool is_amd_k8_pre_npt(void)
{
	struct cpuinfo_x86 *c = &boot_cpu_data;

	return unlikely(IS_ENABLED(CONFIG_CPU_SUP_AMD) &&
			c->x86_vendor == X86_VENDOR_AMD &&
			c->x86 == 0xf && c->x86_model < 0x40);
}

/*
 * Decide whether the faulting instruction is a prefetch hit by erratum #91.
 *
 * Scans at most 15 bytes starting at the linear address of the faulting
 * IP, with page faults disabled, feeding each byte to
 * check_prefetch_opcode().
 *
 * Returns non-zero if a prefetch opcode was found, 0 otherwise (including
 * on CPUs that are not affected and on instruction-fetch faults).
 *
 * Note: 'addr' is not used by this function.
 */
static int
is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
{
	unsigned char *max_instr;
	unsigned char *instr;
	int prefetch = 0;

	/* Erratum #91 affects AMD K8, pre-NPT CPUs */
	if (!is_amd_k8_pre_npt())
		return 0;

	/*
	 * If it was an exec (instruction fetch) fault on NX page, then
	 * do not ignore the fault:
	 */
	if (error_code & X86_PF_INSTR)
		return 0;

	instr = (void *)convert_ip_to_linear(current, regs);
	max_instr = instr + 15;

	/*
	 * This code has historically always bailed out if IP points to a
	 * not-present page (e.g. due to a race).  No one has ever
	 * complained about this.
	 */
	pagefault_disable();

	while (instr < max_instr) {
		unsigned char opcode;

		if (user_mode(regs)) {
			if (get_user(opcode, (unsigned char __user *) instr))
				break;
		} else {
			if (get_kernel_nofault(opcode, instr))
				break;
		}

		/* Advance first: the checker expects the address of the next byte. */
		instr++;

		if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
			break;
	}

	pagefault_enable();
	return prefetch;
}

DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

#ifdef CONFIG_X86_32
/*
 * Bring the PMD entry covering 'address' in the page table rooted at
 * 'pgd' in line with the one in init_mm.
 *
 * Returns the init_mm PMD entry, or NULL if any level of the init_mm
 * walk (PGD, P4D, PUD or PMD) is not present.
 */
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
	unsigned index = pgd_index(address);
	pgd_t *pgd_k;
	p4d_t *p4d, *p4d_k;
	pud_t *pud, *pud_k;
	pmd_t *pmd, *pmd_k;

	pgd += index;
	pgd_k = init_mm.pgd + index;

	if (!pgd_present(*pgd_k))
		return NULL;

	/*
	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
	 * and redundant with the set_pmd() on non-PAE. As would
	 * set_p4d/set_pud.
	 */
	p4d = p4d_offset(pgd, address);
	p4d_k = p4d_offset(pgd_k, address);
	if (!p4d_present(*p4d_k))
		return NULL;

	pud = pud_offset(p4d, address);
	pud_k = pud_offset(p4d_k, address);
	if (!pud_present(*pud_k))
		return NULL;

	pmd = pmd_offset(pud, address);
	pmd_k = pmd_offset(pud_k, address);

	/* Copy the reference entry only when the two disagree on presence. */
	if (pmd_present(*pmd) != pmd_present(*pmd_k))
		set_pmd(pmd, *pmd_k);

	if (!pmd_present(*pmd_k))
		return NULL;
	else
		BUG_ON(pmd_pfn(*pmd) != pmd_pfn(*pmd_k));

	return pmd_k;
}

/*
 *   Handle a fault on the vmalloc or module mapping area
 *
 *   This is needed because there is a race condition between the time
 *   when the vmalloc mapping code updates the PMD to the point in time
 *   where it synchronizes this update with the other page-tables in the
 *   system.
 *
 *   In this race window another thread/CPU can map an area on the same
 *   PMD, finds it already present and does not synchronize it with the
 *   rest of the system yet. As a result v[mz]alloc might return areas
 *   which are not mapped in every page-table in the system, causing an
 *   unhandled page-fault when they are accessed.
 *
 *   Returns 0 if the address is now mapped, -1 if it lies outside
 *   [VMALLOC_START, VMALLOC_END) or is not present in the reference
 *   page table.
 */
static noinline int vmalloc_fault(unsigned long address)
{
	unsigned long pgd_paddr;
	pmd_t *pmd_k;
	pte_t *pte_k;

	/* Make sure we are in vmalloc area: */
	if (!(address >= VMALLOC_START && address < VMALLOC_END))
		return -1;

	/*
	 * Synchronize this task's top level page-table
	 * with the 'reference' page table.
	 *
	 * Do _not_ use "current" here. We might be inside
	 * an interrupt in the middle of a task switch..
	 */
	pgd_paddr = read_cr3_pa();
	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
	if (!pmd_k)
		return -1;

	if (pmd_leaf(*pmd_k))
		return 0;

	pte_k = pte_offset_kernel(pmd_k, address);
	if (!pte_present(*pte_k))
		return -1;

	return 0;
}
NOKPROBE_SYMBOL(vmalloc_fault);

/*
 * Propagate kernel PMD entries to every page table on pgd_list.
 *
 * Steps in PMD_SIZE units from 'start' (rounded down to a PMD boundary)
 * while the address stays within [TASK_SIZE_MAX, VMALLOC_END).
 *
 * Note: 'end' is not used by the loop; the upper bound is VMALLOC_END.
 */
void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
{
	unsigned long addr;

	for (addr = start & PMD_MASK;
	     addr >= TASK_SIZE_MAX && addr < VMALLOC_END;
	     addr += PMD_SIZE) {
		struct page *page;

		spin_lock(&pgd_lock);
		list_for_each_entry(page, &pgd_list, lru) {
			spinlock_t *pgt_lock;

			/* the pgt_lock only for Xen */
			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;

			spin_lock(pgt_lock);
			vmalloc_sync_one(page_address(page), addr);
			spin_unlock(pgt_lock);
		}
		spin_unlock(&pgd_lock);
	}
}

/* True if 'pfn' is below max_low_pfn. */
static bool low_pfn(unsigned long pfn)
{
	return pfn < max_low_pfn;
}

/*
 * 32-bit variant: print the page-table entries that map 'address' in the
 * page table currently loaded in CR3, on a single log line.
 */
static void dump_pagetable(unsigned long address)
{
	pgd_t *base = __va(read_cr3_pa());
	pgd_t *pgd = &base[pgd_index(address)];
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

#ifdef CONFIG_X86_PAE
	pr_info("*pdpt = %016Lx ", pgd_val(*pgd));
	if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
		goto out;
	/* With PAE the line was started above, so the PDE continues it. */
#define pr_pde pr_cont
#else
#define pr_pde pr_info
#endif
	p4d = p4d_offset(pgd, address);
	pud = pud_offset(p4d, address);
	pmd = pmd_offset(pud, address);
	pr_pde("*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
#undef pr_pde

	/*
	 * We must not directly access the pte in the highpte
	 * case if the page table is located in highmem.
	 * And let's rather not kmap-atomic the pte, just in case
	 * it's allocated already:
	 */
	if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_leaf(*pmd))
		goto out;

	pte = pte_offset_kernel(pmd, address);
	pr_cont("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
out:
	pr_cont("\n");
}

#else /* CONFIG_X86_64: */

#ifdef CONFIG_CPU_SUP_AMD
/* Printed once by the erratum #93 workaround when it triggers. */
static const char errata93_warning[] =
KERN_ERR
"******* Your BIOS seems to not contain a fix for K8 errata #93\n"
"******* Working around it, but it may cause SEGVs or burn power.\n"
"******* Please consider a BIOS update.\n"
"******* Disabling USB legacy in the BIOS may also help.\n";
#endif

/*
 * Probe whether an unsigned long can be read at 'p'; returns the result
 * of get_kernel_nofault(), so a non-zero value is treated as "bad" by
 * the callers.
 */
static int bad_address(void *p)
{
	unsigned long dummy;

	return get_kernel_nofault(dummy, (unsigned long *)p);
}

/*
 * 64-bit variant: print the PGD/P4D/PUD/PMD/PTE entries that map
 * 'address' in the page table currently loaded in CR3.
 *
 * Each entry is probed with bad_address() before it is dereferenced;
 * the walk stops early at a non-present or leaf entry, and prints "BAD"
 * if an entry cannot be read.
 */
static void dump_pagetable(unsigned long address)
{
	pgd_t *base = __va(read_cr3_pa());
	pgd_t *pgd = base + pgd_index(address);
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (bad_address(pgd))
		goto bad;

	pr_info("PGD %lx ", pgd_val(*pgd));

	if (!pgd_present(*pgd))
		goto out;

	p4d = p4d_offset(pgd, address);
	if (bad_address(p4d))
		goto bad;

	pr_cont("P4D %lx ", p4d_val(*p4d));
	if (!p4d_present(*p4d) || p4d_leaf(*p4d))
		goto out;

	pud = pud_offset(p4d, address);
	if (bad_address(pud))
		goto bad;

	pr_cont("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud) || pud_leaf(*pud))
		goto out;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd))
		goto bad;

	pr_cont("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_leaf(*pmd))
		goto out;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte))
		goto bad;

	pr_cont("PTE %lx", pte_val(*pte));
out:
	pr_cont("\n");
	return;
bad:
	pr_info("BAD\n");
}

#endif /* CONFIG_X86_64 */

/*
 * Workaround for K8 erratum #93 & buggy BIOS.
 *
 * BIOS SMM functions are required to use a specific workaround
 * to avoid corruption of the 64bit RIP register on C stepping K8.
 *
 * A lot of BIOS that didn't get tested properly miss this.
 *
 * The OS sees this as a page fault with the upper 32bits of RIP cleared.
* Try to work around it here.
 *
 * Note we only handle faults in kernel here.
 * Does nothing on 32-bit.
 */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD)
	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
	    || boot_cpu_data.x86 != 0xf)
		return 0;

	if (user_mode(regs))
		return 0;

	/* Only an instruction fetch at the (truncated) IP matches. */
	if (address != regs->ip)
		return 0;

	if ((address >> 32) != 0)
		return 0;

	/* Restore the upper 32 bits and see if that lands in kernel text. */
	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		printk_once(errata93_warning);
		regs->ip = address;
		return 1;
	}
#endif
	return 0;
}

/*
 * Work around K8 erratum #100 K8 in compat mode occasionally jumps
 * to illegal addresses >4GB.
 *
 * We catch this in the page fault handler because these addresses
 * are not reachable. Just detect this case and return.  Any code
 * segment in LDT is compatibility mode.
 *
 * Returns 1 if the fault matches the erratum, 0 otherwise (always 0
 * without CONFIG_X86_64).
 */
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32))
		return 1;
#endif
	return 0;
}

/*
 * Pentium F0 0F C7 C8 bug workaround:
 *
 * Returns 1 after calling handle_invalid_op() when the CPU has
 * X86_BUG_F00F, the error code lacks X86_PF_USER and
 * idt_is_f00f_address() accepts the address; 0 otherwise (always 0
 * without CONFIG_X86_F00F_BUG).
 */
static int is_f00f_bug(struct pt_regs *regs, unsigned long error_code,
		       unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
	if (boot_cpu_has_bug(X86_BUG_F00F) && !(error_code & X86_PF_USER) &&
	    idt_is_f00f_address(address)) {
		handle_invalid_op(regs);
		return 1;
	}
#endif
	return 0;
}

/*
 * Print the base and limit of the descriptor selected by 'index' in the
 * GDT described by 'gdt', prefixed with 'name'.
 *
 * Prints a diagnostic instead when the selector is 0, when the entry
 * would lie outside gdt->size, or when the entry cannot be read.
 */
static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index)
{
	/* Drop the low three selector bits to get the table index. */
	u32 offset = (index >> 3) * sizeof(struct desc_struct);
	unsigned long addr;
	struct ldttss_desc desc;

	if (index == 0) {
		pr_alert("%s: NULL\n", name);
		return;
	}

	if (offset + sizeof(struct ldttss_desc) >= gdt->size) {
		pr_alert("%s: 0x%hx -- out of bounds\n", name, index);
		return;
	}

	if (copy_from_kernel_nofault(&desc, (void *)(gdt->address + offset),
			      sizeof(struct ldttss_desc))) {
		pr_alert("%s: 0x%hx -- GDT entry is not readable\n",
			 name, index);
		return;
	}

	/* Reassemble the base address from its split descriptor fields. */
	addr = desc.base0 | (desc.base1 << 16) | ((unsigned long)desc.base2 << 24);
#ifdef CONFIG_X86_64
	addr |= ((u64)desc.base3 << 32);
#endif
	pr_alert("%s: 0x%hx -- base=0x%lx limit=0x%x\n",
		 name, index, addr, (desc.limit0 | (desc.limit1 << 16)));
}

/*
 * Print the human-readable description of a fatal page fault: the kind
 * of access, a decoded error code, descriptor-table details for implicit
 * supervisor accesses from user mode, and the page-table walk.
 *
 * Does nothing when oops_may_print() says printing is not allowed.
 */
static void
show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
	if (!oops_may_print())
		return;

	if (error_code & X86_PF_INSTR) {
		unsigned int level;
		pgd_t *pgd;
		pte_t *pte;

		pgd = __va(read_cr3_pa());
		pgd += pgd_index(address);

		pte = lookup_address_in_pgd(pgd, address, &level);

		if (pte && pte_present(*pte) && !pte_exec(*pte))
			pr_crit("kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n",
				from_kuid(&init_user_ns, current_uid()));
		if (pte && pte_present(*pte) && pte_exec(*pte) &&
				(pgd_flags(*pgd) & _PAGE_USER) &&
				(__read_cr4() & X86_CR4_SMEP))
			pr_crit("unable to execute userspace code (SMEP?) (uid: %d)\n",
				from_kuid(&init_user_ns, current_uid()));
	}

	if (address < PAGE_SIZE && !user_mode(regs))
		pr_alert("BUG: kernel NULL pointer dereference, address: %px\n",
			(void *)address);
	else
		pr_alert("BUG: unable to handle page fault for address: %px\n",
			(void *)address);

	pr_alert("#PF: %s %s in %s mode\n",
		 (error_code & X86_PF_USER)  ? "user" : "supervisor",
		 (error_code & X86_PF_INSTR) ? "instruction fetch" :
		 (error_code & X86_PF_WRITE) ? "write access" :
					       "read access",
			     user_mode(regs) ? "user" : "kernel");
	pr_alert("#PF: error_code(0x%04lx) - %s\n", error_code,
		 !(error_code & X86_PF_PROT) ? "not-present page" :
		 (error_code & X86_PF_RSVD)  ? "reserved bit violation" :
		 (error_code & X86_PF_PK)    ? "protection keys violation" :
		 (error_code & X86_PF_RMP)   ? "RMP violation" :
					       "permissions violation");

	if (!(error_code & X86_PF_USER) && user_mode(regs)) {
		struct desc_ptr idt, gdt;
		u16 ldtr, tr;

		/*
		 * This can happen for quite a few reasons.  The more obvious
		 * ones are faults accessing the GDT, or LDT.  Perhaps
		 * surprisingly, if the CPU tries to deliver a benign or
		 * contributory exception from user code and gets a page fault
		 * during delivery, the page fault can be delivered as though
		 * it originated directly from user code.  This could happen
		 * due to wrong permissions on the IDT, GDT, LDT, TSS, or
		 * kernel or IST stack.
		 */
		store_idt(&idt);

		/* Usable even on Xen PV -- it's just slow. */
		native_store_gdt(&gdt);

		pr_alert("IDT: 0x%lx (limit=0x%hx) GDT: 0x%lx (limit=0x%hx)\n",
			 idt.address, idt.size, gdt.address, gdt.size);

		store_ldt(ldtr);
		show_ldttss(&gdt, "LDTR", ldtr);

		store_tr(tr);
		show_ldttss(&gdt, "TR", tr);
	}

	dump_pagetable(address);

	if (error_code & X86_PF_RMP)
		snp_dump_hva_rmpentry(address);
}

/*
 * Report a corrupted page table: log the task name and address, dump the
 * page-table walk and go through the oops path. The task is sent SIGKILL
 * unless __die() returns non-zero, in which case no signal is passed to
 * oops_end().
 */
static noinline void
pgtable_bad(struct pt_regs *regs, unsigned long error_code,
	    unsigned long address)
{
	struct task_struct *tsk;
	unsigned long flags;
	int sig;

	flags = oops_begin();
	tsk = current;
	sig = SIGKILL;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       tsk->comm, address);
	dump_pagetable(address);

	if (__die("Bad pagetable", regs, error_code))
		sig = 0;

	oops_end(flags, regs, sig);
}

/*
 * Adjust '*error_code' before it is exposed: sets X86_PF_PROT when
 * 'address' is at or above TASK_SIZE_MAX, and leaves it untouched
 * otherwise.
 */
static void sanitize_error_code(unsigned long address,
				unsigned long *error_code)
{
	/*
	 * To avoid leaking information about the kernel page
	 * table layout, pretend that user-mode accesses to
	 * kernel addresses are always protection faults.
	 *
	 * NB: This means that failed vsyscalls with vsyscall=none
	 * will have the PROT bit.  This doesn't leak any
	 * information and does not appear to cause any problems.
	 */
	if (address >= TASK_SIZE_MAX)
		*error_code |= X86_PF_PROT;
}

/*
 * Record the fault details in current->thread (trap number, error code
 * and CR2 value). X86_PF_USER is always OR-ed into the stored error
 * code.
 */
static void set_signal_archinfo(unsigned long address,
				unsigned long error_code)
{
	struct task_struct *tsk = current;

	tsk->thread.trap_nr = X86_TRAP_PF;
	tsk->thread.error_code = error_code | X86_PF_USER;
	tsk->thread.cr2 = address;
}

/*
 * Handle a page fault that cannot be fixed up.
 *
 * For faults taken in kernel mode this first tries the stack-overflow
 * (CONFIG_VMAP_STACK), EFI and KFENCE special cases; if KFENCE handles
 * the fault the function returns normally. Otherwise, and always for
 * faults taken in user mode, it prints the oops and ends it with
 * SIGKILL, or with no signal when __die() returns non-zero.
 */
static noinline void
page_fault_oops(struct pt_regs *regs, unsigned long error_code,
		unsigned long address)
{
#ifdef CONFIG_VMAP_STACK
	struct stack_info info;
#endif
	unsigned long flags;
	int sig;

	if (user_mode(regs)) {
		/*
		 * Implicit kernel access from user mode?  Skip the stack
		 * overflow and EFI special cases.
		 */
		goto oops;
	}

#ifdef CONFIG_VMAP_STACK
	/*
	 * Stack overflow?  During boot, we can fault near the initial
	 * stack in the direct map, but that's not an overflow -- check
	 * that we're in vmalloc space to avoid this.
	 */
	if (is_vmalloc_addr((void *)address) &&
	    get_stack_guard_info((void *)address, &info)) {
		/*
		 * We're likely to be running with very little stack space
		 * left.  It's plausible that we'd hit this condition but
		 * double-fault even before we get this far, in which case
		 * we're fine: the double-fault handler will deal with it.
		 *
		 * We don't want to make it all the way into the oops code
		 * and then double-fault, though, because we're likely to
		 * break the console driver and lose most of the stack dump.
		 */
		call_on_stack(__this_cpu_ist_top_va(DF) - sizeof(void*),
			      handle_stack_overflow,
			      ASM_CALL_ARG3,
			      , [arg1] "r" (regs), [arg2] "r" (address), [arg3] "r" (&info));

		unreachable();
	}
#endif

	/*
	 * Buggy firmware could access regions which might page fault.  If
	 * this happens, EFI has a special OOPS path that will try to
	 * avoid hanging the system.
	 */
	if (IS_ENABLED(CONFIG_EFI))
		efi_crash_gracefully_on_page_fault(address);

	/* Only not-present faults should be handled by KFENCE. */
	if (!(error_code & X86_PF_PROT) &&
	    kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs))
		return;

oops:
	/*
	 * Oops. The kernel tried to access some bad page. We'll have to
	 * terminate things with extreme prejudice:
	 */
	flags = oops_begin();

	show_fault_oops(regs, error_code, address);

	if (task_stack_end_corrupted(current))
		printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");

	sig = SIGKILL;
	if (__die("Oops", regs, error_code))
		sig = 0;

	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_DEFAULT "CR2: %016lx\n", address);

	oops_end(flags, regs, sig);
}

/*
 * Handle a fault taken in kernel mode that could not be resolved
 * normally: apply an exception-table fixup if one exists, ignore the
 * fault if it is the erratum #91 prefetch case, and oops otherwise.
 *
 * 'signal', 'si_code' and 'pkey' describe the signal to force when a
 * fixup was applied in task context and
 * current->thread.sig_on_uaccess_err is set; a 'signal' of 0 suppresses
 * it.
 */
static noinline void
kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code,
			 unsigned long address, int signal, int si_code,
			 u32 pkey)
{
	WARN_ON_ONCE(user_mode(regs));

	/* Are we prepared to handle this kernel fault? */
	if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
		/*
		 * Any interrupt that takes a fault gets the fixup. This makes
		 * the below recursive fault logic only apply to faults from
		 * task context.
		 */
		if (in_interrupt())
			return;

		/*
		 * Per the above we're !in_interrupt(), aka. task context.
		 *
		 * In this case we need to make sure we're not recursively
		 * faulting through the emulate_vsyscall() logic.
		 */
		if (current->thread.sig_on_uaccess_err && signal) {
			sanitize_error_code(address, &error_code);

			set_signal_archinfo(address, error_code);

			if (si_code == SEGV_PKUERR) {
				force_sig_pkuerr((void __user *)address, pkey);
			} else {
				/* XXX: hwpoison faults will set the wrong code. */
				force_sig_fault(signal, si_code, (void __user *)address);
			}
		}

		/*
		 * Barring that, we can do the fixup and be happy.
		 */
		return;
	}

	/*
	 * AMD erratum #91 manifests as a spurious page fault on a PREFETCH
	 * instruction.
	 */
	if (is_prefetch(regs, error_code, address))
		return;

	page_fault_oops(regs, error_code, address);
}

/*
 * Print out info about fatal segfaults, if the show_unhandled_signals
 * sysctl is set:
 *
 * Nothing is printed when unhandled_signal() reports that 'tsk' handles
 * SIGSEGV, or when printk_ratelimit() denies the message. The log level
 * is KERN_EMERG for a task whose pid number is 1 or lower, KERN_INFO
 * otherwise.
 */
static inline void
show_signal_msg(struct pt_regs *regs, unsigned long error_code,
		unsigned long address, struct task_struct *tsk)
{
	const char *loglvl = task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG;
	/* This is a racy snapshot, but it's better than nothing. */
	int cpu = raw_smp_processor_id();

	if (!unhandled_signal(tsk, SIGSEGV))
		return;

	if (!printk_ratelimit())
		return;

	printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",
		loglvl, tsk->comm, task_pid_nr(tsk), address,
		(void *)regs->ip, (void *)regs->sp, error_code);

	print_vma_addr(KERN_CONT " in ", regs->ip);

	/*
	 * Dump the likely CPU where the fatal segfault happened.
	 * This can help identify faulty hardware.
	 */
	printk(KERN_CONT " likely on CPU %d (core %d, socket %d)", cpu,
	       topology_core_id(cpu), topology_physical_package_id(cpu));

	printk(KERN_CONT "\n");

	show_opcodes(regs, loglvl);
}

/*
 * Common "bad address" handling once no mm lock is held.
 *
 * Kernel-mode faults go to kernelmode_fixup_or_oops(); implicit
 * supervisor accesses from user mode oops; genuine user-mode accesses
 * get SIGSEGV with 'si_code' (or a pkey error carrying 'pkey' when
 * 'si_code' is SEGV_PKUERR), unless an erratum or vDSO fixup absorbs
 * the fault first.
 */
static void
__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
		       unsigned long address, u32 pkey, int si_code)
{
	struct task_struct *tsk = current;

	if (!user_mode(regs)) {
		kernelmode_fixup_or_oops(regs, error_code, address,
					 SIGSEGV, si_code, pkey);
		return;
	}

	if (!(error_code & X86_PF_USER)) {
		/* Implicit user access to kernel memory -- just oops */
		page_fault_oops(regs, error_code, address);
		return;
	}

	/*
	 * User mode accesses just cause a SIGSEGV.
	 * It's possible to have interrupts off here:
	 */
	local_irq_enable();

	/*
	 * Valid to do another page fault here because this one came
	 * from user space:
	 */
	if (is_prefetch(regs, error_code, address))
		return;

	if (is_errata100(regs, address))
		return;

	sanitize_error_code(address, &error_code);

	if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address))
		return;

	if (likely(show_unhandled_signals))
		show_signal_msg(regs, error_code, address, tsk);

	set_signal_archinfo(address, error_code);

	if (si_code == SEGV_PKUERR)
		force_sig_pkuerr((void __user *)address, pkey);
	else
		force_sig_fault(SIGSEGV, si_code, (void __user *)address);

	local_irq_disable();
}

/* Bad-area handling with no pkey and si_code SEGV_MAPERR. */
static noinline void
bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
		     unsigned long address)
{
	__bad_area_nosemaphore(regs, error_code, address, 0, SEGV_MAPERR);
}

/*
 * Bad-area handling for callers that hold the mmap read lock of
 * current->mm: releases it, then continues in __bad_area_nosemaphore().
 */
static void
__bad_area(struct pt_regs *regs, unsigned long error_code,
	   unsigned long address, u32 pkey, int si_code)
{
	struct mm_struct *mm = current->mm;
	/*
	 * Something tried to access memory that isn't in our memory map..
	 * Fix it, but check if it's kernel or user first..
	 */
	mmap_read_unlock(mm);

	__bad_area_nosemaphore(regs, error_code, address, pkey, si_code);
}

/*
 * True if the access error should be attributed to protection keys:
 * OSPKE must be enabled, and either the error code has X86_PF_PK set or
 * arch_vma_access_permitted() rejects the access for 'vma'.
 */
static inline bool bad_area_access_from_pkeys(unsigned long error_code,
		struct vm_area_struct *vma)
{
	/* This code is always called on the current mm */
	bool foreign = false;

	if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
		return false;
	if (error_code & X86_PF_PK)
		return true;
	/* this checks permission keys on the VMA: */
	if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
				       (error_code & X86_PF_INSTR), foreign))
		return true;
	return false;
}

/*
 * Report an access that hit a VMA but was not permitted: SEGV_PKUERR
 * with the VMA's pkey for protection-key failures, SEGV_ACCERR
 * otherwise. Both paths go through __bad_area(), which drops the mmap
 * read lock.
 */
static noinline void
bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
		      unsigned long address, struct vm_area_struct *vma)
{
	/*
	 * This OSPKE check is not strictly necessary at runtime.
	 * But, doing it this way allows compiler optimizations
	 * if pkeys are compiled out.
	 */
	if (bad_area_access_from_pkeys(error_code, vma)) {
		/*
		 * A protection key fault means that the PKRU value did not allow
		 * access to some PTE.  Userspace can figure out what PKRU was
		 * from the XSAVE state.  This function captures the pkey from
		 * the vma and passes it to userspace so userspace can discover
		 * which protection key was set on the PTE.
		 *
		 * If we get here, we know that the hardware signaled a X86_PF_PK
		 * fault and that there was a VMA once we got in the fault
		 * handler.  It does *not* guarantee that the VMA we find here
		 * was the one that we faulted on.
		 *
		 * 1. T1   : mprotect_key(foo, PAGE_SIZE, pkey=4);
		 * 2. T1   : set PKRU to deny access to pkey=4, touches page
		 * 3. T1   : faults...
		 * 4.    T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
		 * 5. T1   : enters fault handler, takes mmap_lock, etc...
		 * 6. T1   : reaches here, sees vma_pkey(vma)=5, when we really
		 *	     faulted on a pte with its pkey=4.
		 */
		u32 pkey = vma_pkey(vma);

		__bad_area(regs, error_code, address, pkey, SEGV_PKUERR);
	} else {
		__bad_area(regs, error_code, address, 0, SEGV_ACCERR);
	}
}

/*
 * Deliver SIGBUS for a fault the MM could not service.
 *
 * Kernel-mode faults are handed to kernelmode_fixup_or_oops(). For
 * user-mode faults the erratum #91 and vDSO fixups are tried first;
 * with CONFIG_MEMORY_FAILURE, hardware-poison results in 'fault' are
 * reported via force_sig_mceerr() instead of a plain BUS_ADRERR.
 */
static void
do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
	  vm_fault_t fault)
{
	/* Kernel mode? Handle exceptions or die: */
	if (!user_mode(regs)) {
		kernelmode_fixup_or_oops(regs, error_code, address,
					 SIGBUS, BUS_ADRERR, ARCH_DEFAULT_PKEY);
		return;
	}

	/* User-space => ok to do another page fault: */
	if (is_prefetch(regs, error_code, address))
		return;

	sanitize_error_code(address, &error_code);

	if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address))
		return;

	set_signal_archinfo(address, error_code);

#ifdef CONFIG_MEMORY_FAILURE
	if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
		struct task_struct *tsk = current;
		unsigned lsb = 0;

		pr_err(
	"MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
			tsk->comm, tsk->pid, address);
		/* If both flags are set, the PAGE_SHIFT assignment below wins. */
		if (fault & VM_FAULT_HWPOISON_LARGE)
			lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
		if (fault & VM_FAULT_HWPOISON)
			lsb = PAGE_SHIFT;
		force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb);
		return;
	}
#endif
	force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
}

/*
 * Check whether the entry '*pte' already grants the access described by
 * 'error_code'.
 *
 * Returns 0 if a write fault hit a non-writable entry or an instruction
 * fetch hit a non-executable entry, 1 otherwise.
 */
static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
{
	if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
		return 0;

	if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))
		return 0;

	return 1;
}

/*
 * Handle a spurious fault caused by a stale TLB entry.
 *
 * This allows us to lazily refresh the TLB when increasing the
 * permissions of a kernel page (RO -> RW or NX -> X).  Doing it
 * eagerly is very expensive since that implies doing a full
 * cross-processor TLB flush, even if no stale TLB entries exist
 * on other processors.
 *
 * Spurious faults may only occur if the TLB contains an entry with
 * fewer permission than the page table entry.  Non-present (P = 0)
 * and reserved bit (R = 1) faults are never spurious.
 *
 * There are no security implications to leaving a stale TLB when
 * increasing the permissions on a page.
 *
 * Returns non-zero if a spurious fault was handled, zero otherwise.
 *
 * See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3
 * (Optional Invalidation).
*/
static noinline int
spurious_kernel_fault(unsigned long error_code, unsigned long address)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	int ret;

	/*
	 * Only writes to RO or instruction fetches from NX may cause
	 * spurious faults.
	 *
	 * These could be from user or supervisor accesses but the TLB
	 * is only lazily flushed after a kernel mapping protection
	 * change, so user accesses are not expected to cause spurious
	 * faults.
	 */
	if (error_code != (X86_PF_WRITE | X86_PF_PROT) &&
	    error_code != (X86_PF_INSTR | X86_PF_PROT))
		return 0;

	/* Walk init_mm's page table; a leaf at any level is checked directly. */
	pgd = init_mm.pgd + pgd_index(address);
	if (!pgd_present(*pgd))
		return 0;

	p4d = p4d_offset(pgd, address);
	if (!p4d_present(*p4d))
		return 0;

	if (p4d_leaf(*p4d))
		return spurious_kernel_fault_check(error_code, (pte_t *) p4d);

	pud = pud_offset(p4d, address);
	if (!pud_present(*pud))
		return 0;

	if (pud_leaf(*pud))
		return spurious_kernel_fault_check(error_code, (pte_t *) pud);

	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return 0;

	if (pmd_leaf(*pmd))
		return spurious_kernel_fault_check(error_code, (pte_t *) pmd);

	pte = pte_offset_kernel(pmd, address);
	if (!pte_present(*pte))
		return 0;

	ret = spurious_kernel_fault_check(error_code, pte);
	if (!ret)
		return 0;

	/*
	 * Make sure we have permissions in PMD.
	 * If not, then there's a bug in the page tables:
	 */
	ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd);
	WARN_ONCE(!ret, "PMD has incorrect permission bits\n");

	return ret;
}
NOKPROBE_SYMBOL(spurious_kernel_fault);

/* Non-zero enables the segfault log message for unhandled user faults. */
int show_unhandled_signals = 1;

/*
 * Decide whether the faulting access is an error for 'vma'.
 *
 * Returns 1 if the access described by 'error_code' must be treated as
 * an access error, 0 if the VMA permits it and the fault can be handled.
 */
static inline int
access_error(unsigned long error_code, struct vm_area_struct *vma)
{
	/* This is only called for the current mm, so: */
	bool foreign = false;

	/*
	 * Read or write was blocked by protection keys.  This is
	 * always an unconditional error and can never result in
	 * a follow-up action to resolve the fault, like a COW.
	 */
	if (error_code & X86_PF_PK)
		return 1;

	/*
	 * SGX hardware blocked the access.  This usually happens
	 * when the enclave memory contents have been destroyed, like
	 * after a suspend/resume cycle. In any case, the kernel can't
	 * fix the cause of the fault.  Handle the fault as an access
	 * error even in cases where no actual access violation
	 * occurred.  This allows userspace to rebuild the enclave in
	 * response to the signal.
	 */
	if (unlikely(error_code & X86_PF_SGX))
		return 1;

	/*
	 * Make sure to check the VMA so that we do not perform
	 * faults just to hit a X86_PF_PK as soon as we fill in a
	 * page.
	 */
	if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
				       (error_code & X86_PF_INSTR), foreign))
		return 1;

	/*
	 * Shadow stack accesses (PF_SHSTK=1) are only permitted to
	 * shadow stack VMAs. All other accesses result in an error.
	 */
	if (error_code & X86_PF_SHSTK) {
		if (unlikely(!(vma->vm_flags & VM_SHADOW_STACK)))
			return 1;
		if (unlikely(!(vma->vm_flags & VM_WRITE)))
			return 1;
		return 0;
	}

	if (error_code & X86_PF_WRITE) {
		/* write, present and write, not present: */
		if (unlikely(vma->vm_flags & VM_SHADOW_STACK))
			return 1;
		if (unlikely(!(vma->vm_flags & VM_WRITE)))
			return 1;
		return 0;
	}

	/* read, present: */
	if (unlikely(error_code & X86_PF_PROT))
		return 1;

	/* read, not present: */
	if (unlikely(!vma_is_accessible(vma)))
		return 1;

	return 0;
}

/*
 * True if 'address' is at or above TASK_SIZE_MAX, with the 64-bit
 * vsyscall page excluded.
 */
bool fault_in_kernel_space(unsigned long address)
{
	/*
	 * On 64-bit systems, the vsyscall page is at an address above
	 * TASK_SIZE_MAX, but is not considered part of the kernel
	 * address space.
	 */
	if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address))
		return false;

	return address >= TASK_SIZE_MAX;
}

/*
 * Called for all faults where 'address' is part of the kernel address
 * space.  Might get called for faults that originate from *code* that
 * ran in userspace or the kernel.
 *
 * Tries, in order: the 32-bit vmalloc demand-fault, the F00F bug
 * workaround, the spurious-fault check and the kprobes hook; anything
 * left over is passed to bad_area_nosemaphore().
 */
static void
do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
		   unsigned long address)
{
	/*
	 * Protection keys exceptions only happen on user pages.  We
	 * have no user pages in the kernel portion of the address
	 * space, so do not expect them here.
	 */
	WARN_ON_ONCE(hw_error_code & X86_PF_PK);

#ifdef CONFIG_X86_32
	/*
	 * We can fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * Before doing this on-demand faulting, ensure that the
	 * fault is not any of the following:
	 * 1. A fault on a PTE with a reserved bit set.
	 * 2. A fault caused by a user-mode access.  (Do not demand-
	 *    fault kernel memory due to user-mode accesses).
	 * 3. A fault caused by a page-level protection violation.
	 *    (A demand fault would be on a non-present page which
	 *     would have X86_PF_PROT==0).
	 *
	 * This is only needed to close a race condition on x86-32 in
	 * the vmalloc mapping/unmapping code. See the comment above
	 * vmalloc_fault() for details. On x86-64 the race does not
	 * exist as the vmalloc mappings don't need to be synchronized
	 * there.
	 */
	if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
		if (vmalloc_fault(address) >= 0)
			return;
	}
#endif

	if (is_f00f_bug(regs, hw_error_code, address))
		return;

	/* Was the fault spurious, caused by lazy TLB invalidation? */
	if (spurious_kernel_fault(hw_error_code, address))
		return;

	/* kprobes don't want to hook the spurious faults: */
	if (WARN_ON_ONCE(kprobe_page_fault(regs, X86_TRAP_PF)))
		return;

	/*
	 * Note, despite being a "bad area", there are quite a few
	 * acceptable reasons to get here, such as erratum fixups
	 * and handling kernel code that can fault, like get_user().
	 *
	 * Don't take the mm semaphore here. If we fixup a prefetch
	 * fault we could otherwise deadlock:
	 */
	bad_area_nosemaphore(regs, hw_error_code, address);
}
NOKPROBE_SYMBOL(do_kern_addr_fault);

/*
 * Handle faults in the user portion of the address space.
Nothing in here * should check X86_PF_USER without a specific justification: for almost * all purposes, we should treat a normal kernel access to user memory * (e.g. get_user(), put_user(), etc.) the same as the WRUSS instruction. * The one exception is AC flag handling, which is, per the x86 * architecture, special for WRUSS. */ static inline void do_user_addr_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address) { struct vm_area_struct *vma; struct task_struct *tsk; struct mm_struct *mm; vm_fault_t fault; unsigned int flags = FAULT_FLAG_DEFAULT; tsk = current; mm = tsk->mm; if (unlikely((error_code & (X86_PF_USER | X86_PF_INSTR)) == X86_PF_INSTR)) { /* * Whoops, this is kernel mode code trying to execute from * user memory. Unless this is AMD erratum #93, which * corrupts RIP such that it looks like a user address, * this is unrecoverable. Don't even try to look up the * VMA or look for extable entries. */ if (is_errata93(regs, address)) return; page_fault_oops(regs, error_code, address); return; } /* kprobes don't want to hook the spurious faults: */ if (WARN_ON_ONCE(kprobe_page_fault(regs, X86_TRAP_PF))) return; /* * Reserved bits are never expected to be set on * entries in the user portion of the page tables. */ if (unlikely(error_code & X86_PF_RSVD)) pgtable_bad(regs, error_code, address); /* * If SMAP is on, check for invalid kernel (supervisor) access to user * pages in the user address space. The odd case here is WRUSS, * which, according to the preliminary documentation, does not respect * SMAP and will have the USER bit set so, in all cases, SMAP * enforcement appears to be consistent with the USER bit. */ if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) && !(error_code & X86_PF_USER) && !(regs->flags & X86_EFLAGS_AC))) { /* * No extable entry here. This was a kernel access to an * invalid pointer. get_kernel_nofault() will not get here. 
*/ page_fault_oops(regs, error_code, address); return; } /* * If we're in an interrupt, have no user context or are running * in a region with pagefaults disabled then we must not take the fault */ if (unlikely(faulthandler_disabled() || !mm)) { bad_area_nosemaphore(regs, error_code, address); return; } /* Legacy check - remove this after verifying that it doesn't trigger */ if (WARN_ON_ONCE(!(regs->flags & X86_EFLAGS_IF))) { bad_area_nosemaphore(regs, error_code, address); return; } local_irq_enable(); perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); /* * Read-only permissions can not be expressed in shadow stack PTEs. * Treat all shadow stack accesses as WRITE faults. This ensures * that the MM will prepare everything (e.g., break COW) such that * maybe_mkwrite() can create a proper shadow stack PTE. */ if (error_code & X86_PF_SHSTK) flags |= FAULT_FLAG_WRITE; if (error_code & X86_PF_WRITE) flags |= FAULT_FLAG_WRITE; if (error_code & X86_PF_INSTR) flags |= FAULT_FLAG_INSTRUCTION; /* * We set FAULT_FLAG_USER based on the register state, not * based on X86_PF_USER. User space accesses that cause * system page faults are still user accesses. */ if (user_mode(regs)) flags |= FAULT_FLAG_USER; #ifdef CONFIG_X86_64 /* * Faults in the vsyscall page might need emulation. The * vsyscall page is at a high address (>PAGE_OFFSET), but is * considered to be part of the user address space. * * The vsyscall page does not have a "real" VMA, so do this * emulation before we go searching for VMAs. * * PKRU never rejects instruction fetches, so we don't need * to consider the PF_PK bit. 
*/ if (is_vsyscall_vaddr(address)) { if (emulate_vsyscall(error_code, regs, address)) return; } #endif if (!(flags & FAULT_FLAG_USER)) goto lock_mmap; vma = lock_vma_under_rcu(mm, address); if (!vma) goto lock_mmap; if (unlikely(access_error(error_code, vma))) { vma_end_read(vma); goto lock_mmap; } fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) vma_end_read(vma); if (!(fault & VM_FAULT_RETRY)) { count_vm_vma_lock_event(VMA_LOCK_SUCCESS); goto done; } count_vm_vma_lock_event(VMA_LOCK_RETRY); if (fault & VM_FAULT_MAJOR) flags |= FAULT_FLAG_TRIED; /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { if (!user_mode(regs)) kernelmode_fixup_or_oops(regs, error_code, address, SIGBUS, BUS_ADRERR, ARCH_DEFAULT_PKEY); return; } lock_mmap: retry: vma = lock_mm_and_find_vma(mm, address, regs); if (unlikely(!vma)) { bad_area_nosemaphore(regs, error_code, address); return; } /* * Ok, we have a good vm_area for this memory access, so * we can handle it.. */ if (unlikely(access_error(error_code, vma))) { bad_area_access_error(regs, error_code, address, vma); return; } /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if * we get VM_FAULT_RETRY back, the mmap_lock has been unlocked. * * Note that handle_userfault() may also release and reacquire mmap_lock * (and not return with VM_FAULT_RETRY), when returning to userland to * repeat the page fault later with a VM_FAULT_NOPAGE retval * (potentially after handling any pending signal during the return to * userland). The return to userland is identified whenever * FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags. */ fault = handle_mm_fault(vma, address, flags, regs); if (fault_signal_pending(fault, regs)) { /* * Quick path to respond to signals. 
The core mm code * has unlocked the mm for us if we get here. */ if (!user_mode(regs)) kernelmode_fixup_or_oops(regs, error_code, address, SIGBUS, BUS_ADRERR, ARCH_DEFAULT_PKEY); return; } /* The fault is fully completed (including releasing mmap lock) */ if (fault & VM_FAULT_COMPLETED) return; /* * If we need to retry the mmap_lock has already been released, * and if there is a fatal signal pending there is no guarantee * that we made any progress. Handle this case first. */ if (unlikely(fault & VM_FAULT_RETRY)) { flags |= FAULT_FLAG_TRIED; goto retry; } mmap_read_unlock(mm); done: if (likely(!(fault & VM_FAULT_ERROR))) return; if (fatal_signal_pending(current) && !user_mode(regs)) { kernelmode_fixup_or_oops(regs, error_code, address, 0, 0, ARCH_DEFAULT_PKEY); return; } if (fault & VM_FAULT_OOM) { /* Kernel mode? Handle exceptions or die: */ if (!user_mode(regs)) { kernelmode_fixup_or_oops(regs, error_code, address, SIGSEGV, SEGV_MAPERR, ARCH_DEFAULT_PKEY); return; } /* * We ran out of memory, call the OOM killer, and return the * userspace (which will retry the fault, or kill us if we got * oom-killed): */ pagefault_out_of_memory(); } else { if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| VM_FAULT_HWPOISON_LARGE)) do_sigbus(regs, error_code, address, fault); else if (fault & VM_FAULT_SIGSEGV) bad_area_nosemaphore(regs, error_code, address); else BUG(); } } NOKPROBE_SYMBOL(do_user_addr_fault); static __always_inline void trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code, unsigned long address) { if (!trace_pagefault_enabled()) return; if (user_mode(regs)) trace_page_fault_user(address, regs, error_code); else trace_page_fault_kernel(address, regs, error_code); } static __always_inline void handle_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address) { trace_page_fault_entries(regs, error_code, address); if (unlikely(kmmio_fault(regs, address))) return; /* Was the fault on kernel-controlled part of the address 
space? */ if (unlikely(fault_in_kernel_space(address))) { do_kern_addr_fault(regs, error_code, address); } else { do_user_addr_fault(regs, error_code, address); /* * User address page fault handling might have reenabled * interrupts. Fixing up all potential exit points of * do_user_addr_fault() and its leaf functions is just not * doable w/o creating an unholy mess or turning the code * upside down. */ local_irq_disable(); } } DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) { irqentry_state_t state; unsigned long address; address = cpu_feature_enabled(X86_FEATURE_FRED) ? fred_event_data(regs) : read_cr2(); prefetchw(&current->mm->mmap_lock); /* * KVM uses #PF vector to deliver 'page not present' events to guests * (asynchronous page fault mechanism). The event happens when a * userspace task is trying to access some valid (from guest's point of * view) memory which is not currently mapped by the host (e.g. the * memory is swapped out). Note, the corresponding "page ready" event * which is injected when the memory becomes available, is delivered via * an interrupt mechanism and not a #PF exception * (see arch/x86/kernel/kvm.c: sysvec_kvm_asyncpf_interrupt()). * * We are relying on the interrupted context being sane (valid RSP, * relevant locks not held, etc.), which is fine as long as the * interrupted context had IF=1. We are also relying on the KVM * async pf type field and CR2 being read consistently instead of * getting values from real and async page faults mixed up. * * Fingers crossed. * * The async #PF handling code takes care of idtentry handling * itself. */ if (kvm_handle_async_pf(regs, (u32)address)) return; /* * Entry handling for valid #PF from kernel mode is slightly * different: RCU is already watching and ct_irq_enter() must not * be invoked because a kernel fault on a user space address might * sleep. * * In case the fault hit a RCU idle region the conditional entry * code reenabled RCU to avoid subsequent wreckage which helps * debuggability. 
*/ state = irqentry_enter(regs); instrumentation_begin(); handle_page_fault(regs, error_code, address); instrumentation_end(); irqentry_exit(regs, state); }
1 1 1 1 1 1 1 1 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 
523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 
1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 
1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 
1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 
2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 // SPDX-License-Identifier: GPL-2.0 /* * net/tipc/crypto.c: TIPC crypto for key handling & packet en/decryption * * Copyright (c) 2019, Ericsson AB * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
*/

#include <crypto/aead.h>
#include <crypto/aes.h>
#include <crypto/rng.h>
#include "crypto.h"
#include "msg.h"
#include "bcast.h"

/* Time limits, given in milliseconds and converted to jiffies */
#define TIPC_TX_GRACE_PERIOD	msecs_to_jiffies(5000) /* 5s */
#define TIPC_TX_LASTING_TIME	msecs_to_jiffies(10000) /* 10s */
#define TIPC_RX_ACTIVE_LIM	msecs_to_jiffies(3000) /* 3s */
#define TIPC_RX_PASSIVE_LIM	msecs_to_jiffies(15000) /* 15s */

/* Default for sysctl_tipc_max_tfms (see below) */
#define TIPC_MAX_TFMS_DEF	10
/* NOTE(review): presumably the upper bound accepted for max_tfms - confirm */
#define TIPC_MAX_TFMS_LIM	1000

/* 60 * 24 minutes; struct tipc_crypto::rekeying_intv is kept in minutes */
#define TIPC_REKEYING_INTV_DEF	(60 * 24) /* default: 1 day */

/*
 * TIPC Key ids
 *
 * KEY_MASTER is id 0 and KEY_1..KEY_3 are ids 1..3. KEY_MIN and KEY_MAX
 * alias the lowest and highest id; KEY_MAX + 1 sizes the aead[] array in
 * struct tipc_crypto.
 */
enum {
	KEY_MASTER = 0,
	KEY_MIN = KEY_MASTER,
	KEY_1 = 1,
	KEY_2,
	KEY_3,
	KEY_MAX = KEY_3,
};

/*
 * TIPC Crypto statistics
 *
 * STAT_BADKEYS and STAT_BADMSGS deliberately share one value, so they
 * index the same counter slot.
 */
enum {
	STAT_OK,
	STAT_NOK,
	STAT_ASYNC,
	STAT_ASYNC_OK,
	STAT_ASYNC_NOK,
	STAT_BADKEYS, /* tx only */
	STAT_BADMSGS = STAT_BADKEYS, /* rx only */
	STAT_NOKEYS,
	STAT_SWITCHES,

	MAX_STATS,
};

/*
 * TIPC crypto statistics' header
 *
 * One label per counter, in enum order; the shared BADKEYS/BADMSGS slot
 * is labelled "badmsgs".
 */
static const char *hstats[MAX_STATS] = {"ok", "nok", "async", "async_ok",
					"async_nok", "badmsgs", "nokeys",
					"switches"};

/* Max TFMs number per key */
int sysctl_tipc_max_tfms __read_mostly = TIPC_MAX_TFMS_DEF;
/* Key exchange switch, default: on */
int sysctl_tipc_key_exchange_enabled __read_mostly = 1;

/*
 * struct tipc_key - TIPC keys' status indicator
 *
 *         7     6     5     4     3     2     1     0
 *      +-----+-----+-----+-----+-----+-----+-----+-----+
 * key: | (reserved)|passive idx| active idx|pending idx|
 *      +-----+-----+-----+-----+-----+-----+-----+-----+
 *
 * The union overlays the three 2-bit indices with the single byte
 * 'keys', so the whole state can be read or written at once. The
 * bitfield order is swapped per endianness to keep the layout above.
 */
struct tipc_key {
#define KEY_BITS (2)
#define KEY_MASK ((1 << KEY_BITS) - 1)
	union {
		struct {
#if defined(__LITTLE_ENDIAN_BITFIELD)
			u8 pending:2,
			   active:2,
			   passive:2, /* rx only */
			   reserved:2;
#elif defined(__BIG_ENDIAN_BITFIELD)
			u8 reserved:2,
			   passive:2, /* rx only */
			   active:2,
			   pending:2;
#else
#error  "Please fix <asm/byteorder.h>"
#endif
		} __packed;
		u8 keys;
	};
};

/**
 * struct tipc_tfm - TIPC TFM structure to form a list of TFMs
 * @tfm: cipher handle/key
 * @list: linked list of TFMs
 */
struct tipc_tfm {
	struct crypto_aead *tfm;
	struct list_head list;
};

/**
 * struct tipc_aead - TIPC AEAD key structure
 * @tfm_entry: per-cpu pointer to one entry in TFM list
 * @crypto: TIPC crypto owns this key
 * @cloned: reference to the source key in case cloning
 * @users: the number of the key users (TX/RX)
 * @salt: the key's SALT value
 * @authsize: authentication tag size (max = 16)
 * @mode: crypto mode is applied to the key
 * @hint: a hint for user key (hex string of TIPC_AEAD_HINT_LEN key bytes
 *        plus the terminating NUL)
 * @rcu: struct rcu_head
 * @key: the aead key
 * @gen: the key's generation
 * @seqno: the key seqno (cluster scope)
 * @refcnt: the key reference counter
 */
struct tipc_aead {
#define TIPC_AEAD_HINT_LEN (5)
	struct tipc_tfm * __percpu *tfm_entry;
	struct tipc_crypto *crypto;
	struct tipc_aead *cloned;
	atomic_t users;
	u32 salt;
	u8 authsize;
	u8 mode;
	char hint[2 * TIPC_AEAD_HINT_LEN + 1];
	struct rcu_head rcu;
	struct tipc_aead_key *key;
	u16 gen;

	atomic64_t seqno ____cacheline_aligned;
	refcount_t refcnt ____cacheline_aligned;

} ____cacheline_aligned;

/**
 * struct tipc_crypto_stats - TIPC Crypto statistics
 * @stat: array of crypto statistics, indexed by the STAT_* values
 */
struct tipc_crypto_stats {
	unsigned int stat[MAX_STATS];
};

/**
 * struct tipc_crypto - TIPC TX/RX crypto structure
 * @net: struct net
 * @node: TIPC node (RX)
 * @aead: array of pointers to AEAD keys for encryption/decryption
 * @peer_rx_active: replicated peer RX active key index
 * @key_gen: TX/RX key generation
 * @key: the key states
 * @skey_mode: session key's mode
 * @skey: received session key
 * @wq: common workqueue on TX crypto
 * @work: delayed work sched for TX/RX
 * @key_distr: key distributing state
 * @rekeying_intv: rekeying interval (in minutes)
 * @stats: the crypto statistics
 * @name: the crypto name
 * @sndnxt: the per-peer sndnxt (TX)
 * @timer1: general timer 1 (jiffies)
 * @timer2: general timer 2 (jiffies)
 * @working: the crypto is working or not
 * @key_master: flag indicates if master key exists
 * @legacy_user: flag indicates if a peer joins w/o master key (for bwd comp.)
* @nokey: no key indication * @flags: combined flags field * @lock: tipc_key lock */ struct tipc_crypto { struct net *net; struct tipc_node *node; struct tipc_aead __rcu *aead[KEY_MAX + 1]; atomic_t peer_rx_active; u16 key_gen; struct tipc_key key; u8 skey_mode; struct tipc_aead_key *skey; struct workqueue_struct *wq; struct delayed_work work; #define KEY_DISTR_SCHED 1 #define KEY_DISTR_COMPL 2 atomic_t key_distr; u32 rekeying_intv; struct tipc_crypto_stats __percpu *stats; char name[48]; atomic64_t sndnxt ____cacheline_aligned; unsigned long timer1; unsigned long timer2; union { struct { u8 working:1; u8 key_master:1; u8 legacy_user:1; u8 nokey: 1; }; u8 flags; }; spinlock_t lock; /* crypto lock */ } ____cacheline_aligned; /* struct tipc_crypto_tx_ctx - TX context for callbacks */ struct tipc_crypto_tx_ctx { struct tipc_aead *aead; struct tipc_bearer *bearer; struct tipc_media_addr dst; }; /* struct tipc_crypto_rx_ctx - RX context for callbacks */ struct tipc_crypto_rx_ctx { struct tipc_aead *aead; struct tipc_bearer *bearer; }; static struct tipc_aead *tipc_aead_get(struct tipc_aead __rcu *aead); static inline void tipc_aead_put(struct tipc_aead *aead); static void tipc_aead_free(struct rcu_head *rp); static int tipc_aead_users(struct tipc_aead __rcu *aead); static void tipc_aead_users_inc(struct tipc_aead __rcu *aead, int lim); static void tipc_aead_users_dec(struct tipc_aead __rcu *aead, int lim); static void tipc_aead_users_set(struct tipc_aead __rcu *aead, int val); static struct crypto_aead *tipc_aead_tfm_next(struct tipc_aead *aead); static int tipc_aead_init(struct tipc_aead **aead, struct tipc_aead_key *ukey, u8 mode); static int tipc_aead_clone(struct tipc_aead **dst, struct tipc_aead *src); static void *tipc_aead_mem_alloc(struct crypto_aead *tfm, unsigned int crypto_ctx_size, u8 **iv, struct aead_request **req, struct scatterlist **sg, int nsg); static int tipc_aead_encrypt(struct tipc_aead *aead, struct sk_buff *skb, struct tipc_bearer *b, struct 
tipc_media_addr *dst, struct tipc_node *__dnode); static void tipc_aead_encrypt_done(void *data, int err); static int tipc_aead_decrypt(struct net *net, struct tipc_aead *aead, struct sk_buff *skb, struct tipc_bearer *b); static void tipc_aead_decrypt_done(void *data, int err); static inline int tipc_ehdr_size(struct tipc_ehdr *ehdr); static int tipc_ehdr_build(struct net *net, struct tipc_aead *aead, u8 tx_key, struct sk_buff *skb, struct tipc_crypto *__rx); static inline void tipc_crypto_key_set_state(struct tipc_crypto *c, u8 new_passive, u8 new_active, u8 new_pending); static int tipc_crypto_key_attach(struct tipc_crypto *c, struct tipc_aead *aead, u8 pos, bool master_key); static bool tipc_crypto_key_try_align(struct tipc_crypto *rx, u8 new_pending); static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx, struct tipc_crypto *rx, struct sk_buff *skb, u8 tx_key); static void tipc_crypto_key_synch(struct tipc_crypto *rx, struct sk_buff *skb); static int tipc_crypto_key_revoke(struct net *net, u8 tx_key); static inline void tipc_crypto_clone_msg(struct net *net, struct sk_buff *_skb, struct tipc_bearer *b, struct tipc_media_addr *dst, struct tipc_node *__dnode, u8 type); static void tipc_crypto_rcv_complete(struct net *net, struct tipc_aead *aead, struct tipc_bearer *b, struct sk_buff **skb, int err); static void tipc_crypto_do_cmd(struct net *net, int cmd); static char *tipc_crypto_key_dump(struct tipc_crypto *c, char *buf); static char *tipc_key_change_dump(struct tipc_key old, struct tipc_key new, char *buf); static int tipc_crypto_key_xmit(struct net *net, struct tipc_aead_key *skey, u16 gen, u8 mode, u32 dnode); static bool tipc_crypto_key_rcv(struct tipc_crypto *rx, struct tipc_msg *hdr); static void tipc_crypto_work_tx(struct work_struct *work); static void tipc_crypto_work_rx(struct work_struct *work); static int tipc_aead_key_generate(struct tipc_aead_key *skey); #define is_tx(crypto) (!(crypto)->node) #define is_rx(crypto) 
(!is_tx(crypto)) #define key_next(cur) ((cur) % KEY_MAX + 1) #define tipc_aead_rcu_ptr(rcu_ptr, lock) \ rcu_dereference_protected((rcu_ptr), lockdep_is_held(lock)) #define tipc_aead_rcu_replace(rcu_ptr, ptr, lock) \ do { \ struct tipc_aead *__tmp = rcu_dereference_protected((rcu_ptr), \ lockdep_is_held(lock)); \ rcu_assign_pointer((rcu_ptr), (ptr)); \ tipc_aead_put(__tmp); \ } while (0) #define tipc_crypto_key_detach(rcu_ptr, lock) \ tipc_aead_rcu_replace((rcu_ptr), NULL, lock) /** * tipc_aead_key_validate - Validate a AEAD user key * @ukey: pointer to user key data * @info: netlink info pointer */ int tipc_aead_key_validate(struct tipc_aead_key *ukey, struct genl_info *info) { int keylen; /* Check if algorithm exists */ if (unlikely(!crypto_has_alg(ukey->alg_name, 0, 0))) { GENL_SET_ERR_MSG(info, "unable to load the algorithm (module existed?)"); return -ENODEV; } /* Currently, we only support the "gcm(aes)" cipher algorithm */ if (strcmp(ukey->alg_name, "gcm(aes)")) { GENL_SET_ERR_MSG(info, "not supported yet the algorithm"); return -ENOTSUPP; } /* Check if key size is correct */ keylen = ukey->keylen - TIPC_AES_GCM_SALT_SIZE; if (unlikely(keylen != TIPC_AES_GCM_KEY_SIZE_128 && keylen != TIPC_AES_GCM_KEY_SIZE_192 && keylen != TIPC_AES_GCM_KEY_SIZE_256)) { GENL_SET_ERR_MSG(info, "incorrect key length (20, 28 or 36 octets?)"); return -EKEYREJECTED; } return 0; } /** * tipc_aead_key_generate - Generate new session key * @skey: input/output key with new content * * Return: 0 in case of success, otherwise < 0 */ static int tipc_aead_key_generate(struct tipc_aead_key *skey) { int rc = 0; /* Fill the key's content with a random value via RNG cipher */ rc = crypto_get_default_rng(); if (likely(!rc)) { rc = crypto_rng_get_bytes(crypto_default_rng, skey->key, skey->keylen); crypto_put_default_rng(); } return rc; } static struct tipc_aead *tipc_aead_get(struct tipc_aead __rcu *aead) { struct tipc_aead *tmp; rcu_read_lock(); tmp = rcu_dereference(aead); if (unlikely(!tmp || 
!refcount_inc_not_zero(&tmp->refcnt))) tmp = NULL; rcu_read_unlock(); return tmp; } static inline void tipc_aead_put(struct tipc_aead *aead) { if (aead && refcount_dec_and_test(&aead->refcnt)) call_rcu(&aead->rcu, tipc_aead_free); } /** * tipc_aead_free - Release AEAD key incl. all the TFMs in the list * @rp: rcu head pointer */ static void tipc_aead_free(struct rcu_head *rp) { struct tipc_aead *aead = container_of(rp, struct tipc_aead, rcu); struct tipc_tfm *tfm_entry, *head, *tmp; if (aead->cloned) { tipc_aead_put(aead->cloned); } else { head = *get_cpu_ptr(aead->tfm_entry); put_cpu_ptr(aead->tfm_entry); list_for_each_entry_safe(tfm_entry, tmp, &head->list, list) { crypto_free_aead(tfm_entry->tfm); list_del(&tfm_entry->list); kfree(tfm_entry); } /* Free the head */ crypto_free_aead(head->tfm); list_del(&head->list); kfree(head); } free_percpu(aead->tfm_entry); kfree_sensitive(aead->key); kfree(aead); } static int tipc_aead_users(struct tipc_aead __rcu *aead) { struct tipc_aead *tmp; int users = 0; rcu_read_lock(); tmp = rcu_dereference(aead); if (tmp) users = atomic_read(&tmp->users); rcu_read_unlock(); return users; } static void tipc_aead_users_inc(struct tipc_aead __rcu *aead, int lim) { struct tipc_aead *tmp; rcu_read_lock(); tmp = rcu_dereference(aead); if (tmp) atomic_add_unless(&tmp->users, 1, lim); rcu_read_unlock(); } static void tipc_aead_users_dec(struct tipc_aead __rcu *aead, int lim) { struct tipc_aead *tmp; rcu_read_lock(); tmp = rcu_dereference(aead); if (tmp) atomic_add_unless(&rcu_dereference(aead)->users, -1, lim); rcu_read_unlock(); } static void tipc_aead_users_set(struct tipc_aead __rcu *aead, int val) { struct tipc_aead *tmp; int cur; rcu_read_lock(); tmp = rcu_dereference(aead); if (tmp) { do { cur = atomic_read(&tmp->users); if (cur == val) break; } while (atomic_cmpxchg(&tmp->users, cur, val) != cur); } rcu_read_unlock(); } /** * tipc_aead_tfm_next - Move TFM entry to the next one in list and return it * @aead: the AEAD key pointer */ 
static struct crypto_aead *tipc_aead_tfm_next(struct tipc_aead *aead) { struct tipc_tfm **tfm_entry; struct crypto_aead *tfm; tfm_entry = get_cpu_ptr(aead->tfm_entry); *tfm_entry = list_next_entry(*tfm_entry, list); tfm = (*tfm_entry)->tfm; put_cpu_ptr(tfm_entry); return tfm; } /** * tipc_aead_init - Initiate TIPC AEAD * @aead: returned new TIPC AEAD key handle pointer * @ukey: pointer to user key data * @mode: the key mode * * Allocate a (list of) new cipher transformation (TFM) with the specific user * key data if valid. The number of the allocated TFMs can be set via the sysfs * "net/tipc/max_tfms" first. * Also, all the other AEAD data are also initialized. * * Return: 0 if the initiation is successful, otherwise: < 0 */ static int tipc_aead_init(struct tipc_aead **aead, struct tipc_aead_key *ukey, u8 mode) { struct tipc_tfm *tfm_entry, *head; struct crypto_aead *tfm; struct tipc_aead *tmp; int keylen, err, cpu; int tfm_cnt = 0; if (unlikely(*aead)) return -EEXIST; /* Allocate a new AEAD */ tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC); if (unlikely(!tmp)) return -ENOMEM; /* The key consists of two parts: [AES-KEY][SALT] */ keylen = ukey->keylen - TIPC_AES_GCM_SALT_SIZE; /* Allocate per-cpu TFM entry pointer */ tmp->tfm_entry = alloc_percpu(struct tipc_tfm *); if (!tmp->tfm_entry) { kfree_sensitive(tmp); return -ENOMEM; } /* Make a list of TFMs with the user key data */ do { tfm = crypto_alloc_aead(ukey->alg_name, 0, 0); if (IS_ERR(tfm)) { err = PTR_ERR(tfm); break; } if (unlikely(!tfm_cnt && crypto_aead_ivsize(tfm) != TIPC_AES_GCM_IV_SIZE)) { crypto_free_aead(tfm); err = -ENOTSUPP; break; } err = crypto_aead_setauthsize(tfm, TIPC_AES_GCM_TAG_SIZE); err |= crypto_aead_setkey(tfm, ukey->key, keylen); if (unlikely(err)) { crypto_free_aead(tfm); break; } tfm_entry = kmalloc(sizeof(*tfm_entry), GFP_KERNEL); if (unlikely(!tfm_entry)) { crypto_free_aead(tfm); err = -ENOMEM; break; } INIT_LIST_HEAD(&tfm_entry->list); tfm_entry->tfm = tfm; /* First entry? 
*/ if (!tfm_cnt) { head = tfm_entry; for_each_possible_cpu(cpu) { *per_cpu_ptr(tmp->tfm_entry, cpu) = head; } } else { list_add_tail(&tfm_entry->list, &head->list); } } while (++tfm_cnt < sysctl_tipc_max_tfms); /* Not any TFM is allocated? */ if (!tfm_cnt) { free_percpu(tmp->tfm_entry); kfree_sensitive(tmp); return err; } /* Form a hex string of some last bytes as the key's hint */ bin2hex(tmp->hint, ukey->key + keylen - TIPC_AEAD_HINT_LEN, TIPC_AEAD_HINT_LEN); /* Initialize the other data */ tmp->mode = mode; tmp->cloned = NULL; tmp->authsize = TIPC_AES_GCM_TAG_SIZE; tmp->key = kmemdup(ukey, tipc_aead_key_size(ukey), GFP_KERNEL); if (!tmp->key) { tipc_aead_free(&tmp->rcu); return -ENOMEM; } memcpy(&tmp->salt, ukey->key + keylen, TIPC_AES_GCM_SALT_SIZE); atomic_set(&tmp->users, 0); atomic64_set(&tmp->seqno, 0); refcount_set(&tmp->refcnt, 1); *aead = tmp; return 0; } /** * tipc_aead_clone - Clone a TIPC AEAD key * @dst: dest key for the cloning * @src: source key to clone from * * Make a "copy" of the source AEAD key data to the dest, the TFMs list is * common for the keys. * A reference to the source is hold in the "cloned" pointer for the later * freeing purposes. * * Note: this must be done in cluster-key mode only! 
* Return: 0 in case of success, otherwise < 0
 */
static int tipc_aead_clone(struct tipc_aead **dst, struct tipc_aead *src)
{
	struct tipc_aead *aead;
	int cpu;

	if (!src)
		return -ENOKEY;

	if (src->mode != CLUSTER_KEY)
		return -EINVAL;

	if (unlikely(*dst))
		return -EEXIST;

	aead = kzalloc(sizeof(*aead), GFP_ATOMIC);
	if (unlikely(!aead))
		return -ENOMEM;

	aead->tfm_entry = alloc_percpu_gfp(struct tipc_tfm *, GFP_ATOMIC);
	if (unlikely(!aead->tfm_entry)) {
		kfree_sensitive(aead);
		return -ENOMEM;
	}

	/* Start each CPU's cursor where the source's cursor currently is */
	for_each_possible_cpu(cpu) {
		*per_cpu_ptr(aead->tfm_entry, cpu) =
				*per_cpu_ptr(src->tfm_entry, cpu);
	}

	memcpy(aead->hint, src->hint, sizeof(src->hint));
	aead->mode = src->mode;
	aead->salt = src->salt;
	aead->authsize = src->authsize;
	atomic_set(&aead->users, 0);
	atomic64_set(&aead->seqno, 0);
	refcount_set(&aead->refcnt, 1);

	/* Pin the source; tipc_aead_free() puts it through ->cloned */
	WARN_ON(!refcount_inc_not_zero(&src->refcnt));
	aead->cloned = src;

	*dst = aead;
	return 0;
}

/**
 * tipc_aead_mem_alloc - Allocate memory for AEAD request operations
 * @tfm: cipher handle to be registered with the request
 * @crypto_ctx_size: size of crypto context for callback
 * @iv: returned pointer to IV data
 * @req: returned pointer to AEAD request data
 * @sg: returned pointer to SG lists
 * @nsg: number of SG lists to be allocated
 *
 * Allocate memory to store the crypto context data, AEAD request, IV and SG
 * lists, the memory layout is as follows:
 * crypto_ctx || iv || aead_req || sg[]
 *
 * Return: the pointer to the memory areas in case of success, otherwise NULL
 */
static void *tipc_aead_mem_alloc(struct crypto_aead *tfm,
				 unsigned int crypto_ctx_size,
				 u8 **iv, struct aead_request **req,
				 struct scatterlist **sg, int nsg)
{
	unsigned int iv_size, req_size;
	unsigned int len;
	u8 *mem;

	iv_size = crypto_aead_ivsize(tfm);
	req_size = sizeof(**req) + crypto_aead_reqsize(tfm);

	/* Accumulate the size of each area plus its alignment padding */
	len = crypto_ctx_size;
	len += iv_size;
	len += crypto_aead_alignmask(tfm) & ~(crypto_tfm_ctx_alignment() - 1);
	len = ALIGN(len, crypto_tfm_ctx_alignment());
	len += req_size;
	len = ALIGN(len, __alignof__(struct scatterlist));
	len += nsg * sizeof(**sg);

	mem = kmalloc(len, GFP_ATOMIC);
	if (!mem)
		return NULL;

	/* Carve the aligned sub-areas out of the single allocation */
	*iv = (u8 *)PTR_ALIGN(mem + crypto_ctx_size,
			      crypto_aead_alignmask(tfm) + 1);
	*req = (struct aead_request *)PTR_ALIGN(*iv + iv_size,
						crypto_tfm_ctx_alignment());
	*sg = (struct scatterlist *)PTR_ALIGN((u8 *)*req + req_size,
					      __alignof__(struct scatterlist));

	return (void *)mem;
}

/**
 * tipc_aead_encrypt - Encrypt a message
 * @aead: TIPC AEAD key for the message encryption
 * @skb: the input/output skb
 * @b: TIPC bearer where the message will be delivered after the encryption
 * @dst: the destination media address
 * @__dnode: TIPC dest node if "known"
 *
 * Return:
 * * 0                   : if the encryption has completed
 * * -EINPROGRESS/-EBUSY : if a callback will be performed
 * * < 0                 : the encryption has failed
 */
static int tipc_aead_encrypt(struct tipc_aead *aead, struct sk_buff *skb,
			     struct tipc_bearer *b,
			     struct tipc_media_addr *dst,
			     struct tipc_node *__dnode)
{
	struct crypto_aead *tfm = tipc_aead_tfm_next(aead);
	struct tipc_crypto_tx_ctx *tx_ctx;
	struct aead_request *req;
	struct sk_buff *trailer;
	struct scatterlist *sg;
	struct tipc_ehdr *ehdr;
	int ehsz, len, tailen, nsg, rc;
	void *ctx;
	u32 salt;
	u8 *iv;

	/* Make sure message len at least 4-byte aligned */
	len = ALIGN(skb->len, 4);
	tailen = len - skb->len + aead->authsize;

	/* Expand skb tail for authentication tag:
	 * As for simplicity, we'd have made sure skb having enough tailroom
	 * for authentication tag @skb allocation. Even when skb is nonlinear
	 * but there is no frag_list, it should be still fine!
	 * Otherwise, we must cow it to be a writable buffer with the tailroom.
	 */
	SKB_LINEAR_ASSERT(skb);
	if (tailen > skb_tailroom(skb)) {
		pr_debug("TX(): skb tailroom is not enough: %d, requires: %d\n",
			 skb_tailroom(skb), tailen);
	}

	nsg = skb_cow_data(skb, tailen, &trailer);
	if (unlikely(nsg < 0)) {
		pr_err("TX: skb_cow_data() returned %d\n", nsg);
		return nsg;
	}

	pskb_put(skb, trailer, tailen);

	/* Allocate memory for the AEAD operation */
	ctx = tipc_aead_mem_alloc(tfm, sizeof(*tx_ctx), &iv, &req, &sg, nsg);
	if (unlikely(!ctx))
		return -ENOMEM;
	TIPC_SKB_CB(skb)->crypto_ctx = ctx;

	/* Map skb to the sg lists */
	sg_init_table(sg, nsg);
	rc = skb_to_sgvec(skb, sg, 0, skb->len);
	if (unlikely(rc < 0)) {
		pr_err("TX: skb_to_sgvec() returned %d, nsg %d!\n", rc, nsg);
		goto exit;
	}

	/* Prepare IV: [SALT (4 octets)][SEQNO (8 octets)]
	 * In case we're in cluster-key mode, SALT is varied by xor-ing with
	 * the source address (or w0 of id), otherwise with the dest address
	 * if dest is known.
	 */
	ehdr = (struct tipc_ehdr *)skb->data;
	salt = aead->salt;
	if (aead->mode == CLUSTER_KEY)
		salt ^= __be32_to_cpu(ehdr->addr);
	else if (__dnode)
		salt ^= tipc_node_get_addr(__dnode);
	memcpy(iv, &salt, 4);
	memcpy(iv + 4, (u8 *)&ehdr->seqno, 8);

	/* Prepare request: the encryption header is the associated data */
	ehsz = tipc_ehdr_size(ehdr);
	aead_request_set_tfm(req, tfm);
	aead_request_set_ad(req, ehsz);
	aead_request_set_crypt(req, sg, sg, len - ehsz, iv);

	/* Set callback function & data */
	aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
				  tipc_aead_encrypt_done, skb);
	tx_ctx = (struct tipc_crypto_tx_ctx *)ctx;
	tx_ctx->aead = aead;
	tx_ctx->bearer = b;
	memcpy(&tx_ctx->dst, dst, sizeof(*dst));

	/* Hold bearer */
	if (unlikely(!tipc_bearer_hold(b))) {
		rc = -ENODEV;
		goto exit;
	}

	/* Now, do encrypt */
	rc = crypto_aead_encrypt(req);
	/* On these codes ctx and the bearer hold are released by the callback */
	if (rc == -EINPROGRESS || rc == -EBUSY)
		return rc;

	tipc_bearer_put(b);

exit:
	kfree(ctx);
	TIPC_SKB_CB(skb)->crypto_ctx = NULL;
	return rc;
}

/**
 * tipc_aead_encrypt_done - Completion callback of an encryption request
 * @data: the skb that was registered with the request
 * @err: completion status of the request
 *
 * On success the skb is sent on the bearer if the bearer is up, in all other
 * final cases it is freed. Nothing is released on -EINPROGRESS.
 */
static void tipc_aead_encrypt_done(void *data, int err)
{
	struct sk_buff *skb = data;
	struct tipc_crypto_tx_ctx *tx_ctx = TIPC_SKB_CB(skb)->crypto_ctx;
	struct tipc_bearer *b = tx_ctx->bearer;
	struct tipc_aead *aead = tx_ctx->aead;
	struct tipc_crypto *tx = aead->crypto;
	struct net *net = tx->net;

	switch (err) {
	case 0:
		this_cpu_inc(tx->stats->stat[STAT_ASYNC_OK]);
		rcu_read_lock();
		if (likely(test_bit(0, &b->up)))
			b->media->send_msg(net, skb, b, &tx_ctx->dst);
		else
			kfree_skb(skb);
		rcu_read_unlock();
		break;
	case -EINPROGRESS:
		return;
	default:
		this_cpu_inc(tx->stats->stat[STAT_ASYNC_NOK]);
		kfree_skb(skb);
		break;
	}

	/* Release what tipc_aead_encrypt() left for the callback */
	kfree(tx_ctx);
	tipc_bearer_put(b);
	tipc_aead_put(aead);
}

/**
 * tipc_aead_decrypt - Decrypt an encrypted message
 * @net: struct net
 * @aead: TIPC AEAD for the message decryption
 * @skb: the input/output skb
 * @b: TIPC bearer where the message has been received
 *
 * Return:
 * * 0                   : if the decryption has completed
 * * -EINPROGRESS/-EBUSY : if a callback will be performed
 * * < 0                 : the decryption has failed
 */
static int tipc_aead_decrypt(struct net *net, struct tipc_aead *aead,
			     struct sk_buff *skb, struct tipc_bearer *b)
{
	struct tipc_crypto_rx_ctx *rx_ctx;
	struct aead_request *req;
	struct crypto_aead *tfm;
	struct sk_buff *unused;
	struct scatterlist *sg;
	struct tipc_ehdr *ehdr;
	int ehsz, nsg, rc;
	void *ctx;
	u32 salt;
	u8 *iv;

	if (unlikely(!aead))
		return -ENOKEY;

	nsg = skb_cow_data(skb, 0, &unused);
	if (unlikely(nsg < 0)) {
		pr_err("RX: skb_cow_data() returned %d\n", nsg);
		return nsg;
	}

	/* Allocate memory for the AEAD operation */
	tfm = tipc_aead_tfm_next(aead);
	ctx = tipc_aead_mem_alloc(tfm, sizeof(*rx_ctx), &iv, &req, &sg, nsg);
	if (unlikely(!ctx))
		return -ENOMEM;
	TIPC_SKB_CB(skb)->crypto_ctx = ctx;

	/* Map skb to the sg lists */
	sg_init_table(sg, nsg);
	rc = skb_to_sgvec(skb, sg, 0, skb->len);
	if (unlikely(rc < 0)) {
		pr_err("RX: skb_to_sgvec() returned %d, nsg %d\n", rc, nsg);
		goto exit;
	}

	/* Reconstruct IV: same [SALT][SEQNO] layout as on the TX side */
	ehdr = (struct tipc_ehdr *)skb->data;
	salt = aead->salt;
	if (aead->mode == CLUSTER_KEY)
		salt ^= __be32_to_cpu(ehdr->addr);
	else if (ehdr->destined)
		salt ^= tipc_own_addr(net);
	memcpy(iv, &salt, 4);
	memcpy(iv + 4, (u8 *)&ehdr->seqno, 8);

	/* Prepare request */
	ehsz = tipc_ehdr_size(ehdr);
	aead_request_set_tfm(req, tfm);
	aead_request_set_ad(req, ehsz);
	aead_request_set_crypt(req, sg, sg, skb->len - ehsz, iv);

	/* Set callback function & data */
	aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
				  tipc_aead_decrypt_done, skb);
	rx_ctx = (struct tipc_crypto_rx_ctx *)ctx;
	rx_ctx->aead = aead;
	rx_ctx->bearer = b;

	/* Hold bearer */
	if (unlikely(!tipc_bearer_hold(b))) {
		rc = -ENODEV;
		goto exit;
	}

	/* Now, do decrypt */
	rc = crypto_aead_decrypt(req);
	/* On these codes ctx and the bearer hold are released by the callback */
	if (rc == -EINPROGRESS || rc == -EBUSY)
		return rc;

	tipc_bearer_put(b);

exit:
	kfree(ctx);
	TIPC_SKB_CB(skb)->crypto_ctx = NULL;
	return rc;
}

/**
 * tipc_aead_decrypt_done - Completion callback of a decryption request
 * @data: the skb that was registered with the request
 * @err: completion status of the request
 *
 * Hands the result to tipc_crypto_rcv_complete() and, if an skb is left
 * afterwards, passes it to tipc_rcv() when the bearer is up or frees it
 * otherwise. Nothing is released on -EINPROGRESS.
 */
static void tipc_aead_decrypt_done(void *data, int err)
{
	struct sk_buff *skb = data;
	struct tipc_crypto_rx_ctx *rx_ctx = TIPC_SKB_CB(skb)->crypto_ctx;
	struct tipc_bearer *b = rx_ctx->bearer;
	struct tipc_aead *aead = rx_ctx->aead;
	struct tipc_crypto_stats __percpu *stats = aead->crypto->stats;
	struct net *net = aead->crypto->net;

	switch (err) {
	case 0:
		this_cpu_inc(stats->stat[STAT_ASYNC_OK]);
		break;
	case -EINPROGRESS:
		return;
	default:
		this_cpu_inc(stats->stat[STAT_ASYNC_NOK]);
		break;
	}

	kfree(rx_ctx);
	tipc_crypto_rcv_complete(net, aead, b, &skb, err);
	if (likely(skb)) {
		if (likely(test_bit(0, &b->up)))
			tipc_rcv(net, skb, b);
		else
			kfree_skb(skb);
	}

	tipc_bearer_put(b);
}

/* Encryption header size: EHDR_CFG_SIZE for LINK_CONFIG, else EHDR_SIZE */
static inline int tipc_ehdr_size(struct tipc_ehdr *ehdr)
{
	return (ehdr->user != LINK_CONFIG) ? EHDR_SIZE : EHDR_CFG_SIZE;
}

/**
 * tipc_ehdr_validate - Validate an encryption message
 * @skb: the message buffer
 *
 * Return: "true" if this is a valid encryption message, otherwise "false"
 */
bool tipc_ehdr_validate(struct sk_buff *skb)
{
	struct tipc_ehdr *ehdr;
	int ehsz;

	if (unlikely(!pskb_may_pull(skb, EHDR_MIN_SIZE)))
		return false;

	ehdr = (struct tipc_ehdr *)skb->data;
	if (unlikely(ehdr->version != TIPC_EVERSION))
		return false;
	ehsz = tipc_ehdr_size(ehdr);
	if (unlikely(!pskb_may_pull(skb, ehsz)))
		return false;
	/* There must be some payload besides the header and the auth tag */
	if (unlikely(skb->len <= ehsz + TIPC_AES_GCM_TAG_SIZE))
		return false;

	return true;
}

/**
 * tipc_ehdr_build - Build TIPC encryption message header
 * @net: struct net
 * @aead: TX AEAD key to be used for the message encryption
 * @tx_key: key id used for the message encryption
 * @skb: input/output message skb
 * @__rx: RX crypto handle if dest is "known"
 *
 * Return: the header size if the building is successful, otherwise < 0
 */
static int tipc_ehdr_build(struct net *net, struct tipc_aead *aead,
			   u8 tx_key, struct sk_buff *skb,
			   struct tipc_crypto *__rx)
{
	struct tipc_msg *hdr = buf_msg(skb);
	struct tipc_ehdr *ehdr;
	u32 user = msg_user(hdr);
	u64 seqno;
	int ehsz;

	/* Make room for encryption header */
	ehsz = (user != LINK_CONFIG) ? EHDR_SIZE : EHDR_CFG_SIZE;
	WARN_ON(skb_headroom(skb) < ehsz);
	ehdr = (struct tipc_ehdr *)skb_push(skb, ehsz);

	/* Obtain a seqno first:
	 * Use the key seqno (= cluster wise) if dest is unknown or we're in
	 * cluster key mode, otherwise it's better for a per-peer seqno!
	 */
	if (!__rx || aead->mode == CLUSTER_KEY)
		seqno = atomic64_inc_return(&aead->seqno);
	else
		seqno = atomic64_inc_return(&__rx->sndnxt);

	/* Revoke the key if seqno is wrapped around */
	if (unlikely(!seqno))
		return tipc_crypto_key_revoke(net, tx_key);

	/* Word 1-2 */
	ehdr->seqno = cpu_to_be64(seqno);

	/* Words 0, 3- */
	ehdr->version = TIPC_EVERSION;
	ehdr->user = 0;
	ehdr->keepalive = 0;
	ehdr->tx_key = tx_key;
	ehdr->destined = (__rx) ? 1 : 0;
	ehdr->rx_key_active = (__rx) ? __rx->key.active : 0;
	ehdr->rx_nokey = (__rx) ? __rx->nokey : 0;
	ehdr->master_key = aead->crypto->key_master;
	ehdr->reserved_1 = 0;
	ehdr->reserved_2 = 0;

	switch (user) {
	case LINK_CONFIG:
		ehdr->user = LINK_CONFIG;
		memcpy(ehdr->id, tipc_own_id(net), NODE_ID_LEN);
		break;
	default:
		if (user == LINK_PROTOCOL && msg_type(hdr) == STATE_MSG) {
			ehdr->user = LINK_PROTOCOL;
			ehdr->keepalive = msg_is_keepalive(hdr);
		}
		ehdr->addr = hdr->hdr[3];
		break;
	}

	return ehsz;
}

/**
 * tipc_crypto_key_set_state - Store a new passive/active/pending slot triple
 * @c: TIPC crypto whose key state is updated
 * @new_passive: new passive key slot
 * @new_active: new active key slot
 * @new_pending: new pending key slot
 *
 * The three values are masked with KEY_MASK and packed into c->key.keys,
 * passive in the highest bits and pending in the lowest. The change is
 * reported through pr_debug().
 */
static inline void tipc_crypto_key_set_state(struct tipc_crypto *c,
					     u8 new_passive,
					     u8 new_active,
					     u8 new_pending)
{
	struct tipc_key old = c->key;
	char buf[32];

	c->key.keys = ((new_passive & KEY_MASK) << (KEY_BITS * 2)) |
		      ((new_active  & KEY_MASK) << (KEY_BITS)) |
		      ((new_pending & KEY_MASK));

	pr_debug("%s: key changing %s ::%pS\n", c->name,
		 tipc_key_change_dump(old, c->key, buf),
		 __builtin_return_address(0));
}

/**
 * tipc_crypto_key_init - Initiate a new user / AEAD key
 * @c: TIPC crypto to which new key is attached
 * @ukey: the user key
 * @mode: the key mode (CLUSTER_KEY or PER_NODE_KEY)
 * @master_key: specify this is a cluster master key
 *
 * A new TIPC AEAD key will be allocated and initiated with the specified user
 * key, then attached to the TIPC crypto.
 *
 * Return: new key id in case of success, otherwise: < 0
 */
int tipc_crypto_key_init(struct tipc_crypto *c, struct tipc_aead_key *ukey,
			 u8 mode, bool master_key)
{
	struct tipc_aead *aead = NULL;
	int rc = 0;

	/* Initiate with the new user key */
	rc = tipc_aead_init(&aead, ukey, mode);

	/* Attach it to the crypto */
	if (likely(!rc)) {
		rc = tipc_crypto_key_attach(c, aead, 0, master_key);
		/* Attaching failed: release the key that was just created */
		if (rc < 0)
			tipc_aead_free(&aead->rcu);
	}

	return rc;
}

/**
 * tipc_crypto_key_attach - Attach a new AEAD key to TIPC crypto
 * @c: TIPC crypto to which the new AEAD key is attached
 * @aead: the new AEAD key pointer
 * @pos: desired slot in the crypto key array, = 0 if any!
* @master_key: specify this is a cluster master key
 *
 * Return: new key id in case of success, otherwise: -EBUSY
 */
static int tipc_crypto_key_attach(struct tipc_crypto *c,
				  struct tipc_aead *aead, u8 pos,
				  bool master_key)
{
	struct tipc_key key;
	int rc = -EBUSY;
	u8 new_key;

	spin_lock_bh(&c->lock);
	key = c->key;
	/* A master key always goes to the KEY_MASTER slot */
	if (master_key) {
		new_key = KEY_MASTER;
		goto attach;
	}
	/* Both active and passive slots are in use: refuse the new key */
	if (key.active && key.passive)
		goto exit;
	if (key.pending) {
		/* A pending key that already has users cannot be replaced */
		if (tipc_aead_users(c->aead[key.pending]) > 0)
			goto exit;
		/* if (pos): ok with replacing, will be aligned when needed */
		/* Replace it */
		new_key = key.pending;
	} else {
		if (pos) {
			if (key.active && pos != key_next(key.active)) {
				key.passive = pos;
				new_key = pos;
				goto attach;
			} else if (!key.active && !key.passive) {
				key.pending = pos;
				new_key = pos;
				goto attach;
			}
		}
		/* Otherwise take the slot following the active (or passive) one */
		key.pending = key_next(key.active ?: key.passive);
		new_key = key.pending;
	}

attach:
	aead->crypto = c;
	/* Only a TX crypto advances the key generation counter */
	aead->gen = (is_tx(c)) ? ++c->key_gen : c->key_gen;
	tipc_aead_rcu_replace(c->aead[new_key], aead, &c->lock);
	if (likely(c->key.keys != key.keys))
		tipc_crypto_key_set_state(c, key.passive, key.active,
					  key.pending);
	c->working = 1;
	c->nokey = 0;
	c->key_master |= master_key;
	rc = new_key;

exit:
	spin_unlock_bh(&c->lock);
	return rc;
}

/**
 * tipc_crypto_key_flush - Detach all keys in slots KEY_MIN..KEY_MAX
 * @c: TIPC crypto to be flushed
 *
 * For an RX crypto, the delayed work is cancelled as well and the TX key that
 * the peer was using (if any) loses one user.
 */
void tipc_crypto_key_flush(struct tipc_crypto *c)
{
	struct tipc_crypto *tx, *rx;
	int k;

	spin_lock_bh(&c->lock);
	if (is_rx(c)) {
		/* Try to cancel pending work */
		rx = c;
		tx = tipc_net(rx->net)->crypto_tx;
		if (cancel_delayed_work(&rx->work)) {
			kfree(rx->skey);
			rx->skey = NULL;
			atomic_xchg(&rx->key_distr, 0);
			tipc_node_put(rx->node);
		}
		/* RX stopping => decrease TX key users if any */
		k = atomic_xchg(&rx->peer_rx_active, 0);
		if (k) {
			tipc_aead_users_dec(tx->aead[k], 0);
			/* Mark the point TX key users changed */
			tx->timer1 = jiffies;
		}
	}

	c->flags = 0;
	tipc_crypto_key_set_state(c, 0, 0, 0);
	for (k = KEY_MIN; k <= KEY_MAX; k++)
		tipc_crypto_key_detach(c->aead[k], &c->lock);
	atomic64_set(&c->sndnxt, 0);
	spin_unlock_bh(&c->lock);
}

/**
 * tipc_crypto_key_try_align -
Align RX keys if possible * @rx: RX crypto handle * @new_pending: new pending slot if aligned (= TX key from peer) * * Peer has used an unknown key slot, this only happens when peer has left and * rejoned, or we are newcomer. * That means, there must be no active key but a pending key at unaligned slot. * If so, we try to move the pending key to the new slot. * Note: A potential passive key can exist, it will be shifted correspondingly! * * Return: "true" if key is successfully aligned, otherwise "false" */ static bool tipc_crypto_key_try_align(struct tipc_crypto *rx, u8 new_pending) { struct tipc_aead *tmp1, *tmp2 = NULL; struct tipc_key key; bool aligned = false; u8 new_passive = 0; int x; spin_lock(&rx->lock); key = rx->key; if (key.pending == new_pending) { aligned = true; goto exit; } if (key.active) goto exit; if (!key.pending) goto exit; if (tipc_aead_users(rx->aead[key.pending]) > 0) goto exit; /* Try to "isolate" this pending key first */ tmp1 = tipc_aead_rcu_ptr(rx->aead[key.pending], &rx->lock); if (!refcount_dec_if_one(&tmp1->refcnt)) goto exit; rcu_assign_pointer(rx->aead[key.pending], NULL); /* Move passive key if any */ if (key.passive) { tmp2 = rcu_replace_pointer(rx->aead[key.passive], tmp2, lockdep_is_held(&rx->lock)); x = (key.passive - key.pending + new_pending) % KEY_MAX; new_passive = (x <= 0) ? 
x + KEY_MAX : x; } /* Re-allocate the key(s) */ tipc_crypto_key_set_state(rx, new_passive, 0, new_pending); rcu_assign_pointer(rx->aead[new_pending], tmp1); if (new_passive) rcu_assign_pointer(rx->aead[new_passive], tmp2); refcount_set(&tmp1->refcnt, 1); aligned = true; pr_info_ratelimited("%s: key[%d] -> key[%d]\n", rx->name, key.pending, new_pending); exit: spin_unlock(&rx->lock); return aligned; } /** * tipc_crypto_key_pick_tx - Pick one TX key for message decryption * @tx: TX crypto handle * @rx: RX crypto handle (can be NULL) * @skb: the message skb which will be decrypted later * @tx_key: peer TX key id * * This function looks up the existing TX keys and pick one which is suitable * for the message decryption, that must be a cluster key and not used before * on the same message (i.e. recursive). * * Return: the TX AEAD key handle in case of success, otherwise NULL */ static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx, struct tipc_crypto *rx, struct sk_buff *skb, u8 tx_key) { struct tipc_skb_cb *skb_cb = TIPC_SKB_CB(skb); struct tipc_aead *aead = NULL; struct tipc_key key = tx->key; u8 k, i = 0; /* Initialize data if not yet */ if (!skb_cb->tx_clone_deferred) { skb_cb->tx_clone_deferred = 1; memset(&skb_cb->tx_clone_ctx, 0, sizeof(skb_cb->tx_clone_ctx)); } skb_cb->tx_clone_ctx.rx = rx; if (++skb_cb->tx_clone_ctx.recurs > 2) return NULL; /* Pick one TX key */ spin_lock(&tx->lock); if (tx_key == KEY_MASTER) { aead = tipc_aead_rcu_ptr(tx->aead[KEY_MASTER], &tx->lock); goto done; } do { k = (i == 0) ? key.pending : ((i == 1) ? 
key.active : key.passive); if (!k) continue; aead = tipc_aead_rcu_ptr(tx->aead[k], &tx->lock); if (!aead) continue; if (aead->mode != CLUSTER_KEY || aead == skb_cb->tx_clone_ctx.last) { aead = NULL; continue; } /* Ok, found one cluster key */ skb_cb->tx_clone_ctx.last = aead; WARN_ON(skb->next); skb->next = skb_clone(skb, GFP_ATOMIC); if (unlikely(!skb->next)) pr_warn("Failed to clone skb for next round if any\n"); break; } while (++i < 3); done: if (likely(aead)) WARN_ON(!refcount_inc_not_zero(&aead->refcnt)); spin_unlock(&tx->lock); return aead; } /** * tipc_crypto_key_synch: Synch own key data according to peer key status * @rx: RX crypto handle * @skb: TIPCv2 message buffer (incl. the ehdr from peer) * * This function updates the peer node related data as the peer RX active key * has changed, so the number of TX keys' users on this node are increased and * decreased correspondingly. * * It also considers if peer has no key, then we need to make own master key * (if any) taking over i.e. starting grace period and also trigger key * distributing process. * * The "per-peer" sndnxt is also reset when the peer key has switched. */ static void tipc_crypto_key_synch(struct tipc_crypto *rx, struct sk_buff *skb) { struct tipc_ehdr *ehdr = (struct tipc_ehdr *)skb_network_header(skb); struct tipc_crypto *tx = tipc_net(rx->net)->crypto_tx; struct tipc_msg *hdr = buf_msg(skb); u32 self = tipc_own_addr(rx->net); u8 cur, new; unsigned long delay; /* Update RX 'key_master' flag according to peer, also mark "legacy" if * a peer has no master key. 
*/ rx->key_master = ehdr->master_key; if (!rx->key_master) tx->legacy_user = 1; /* For later cases, apply only if message is destined to this node */ if (!ehdr->destined || msg_short(hdr) || msg_destnode(hdr) != self) return; /* Case 1: Peer has no keys, let's make master key take over */ if (ehdr->rx_nokey) { /* Set or extend grace period */ tx->timer2 = jiffies; /* Schedule key distributing for the peer if not yet */ if (tx->key.keys && !atomic_cmpxchg(&rx->key_distr, 0, KEY_DISTR_SCHED)) { get_random_bytes(&delay, 2); delay %= 5; delay = msecs_to_jiffies(500 * ++delay); if (queue_delayed_work(tx->wq, &rx->work, delay)) tipc_node_get(rx->node); } } else { /* Cancel a pending key distributing if any */ atomic_xchg(&rx->key_distr, 0); } /* Case 2: Peer RX active key has changed, let's update own TX users */ cur = atomic_read(&rx->peer_rx_active); new = ehdr->rx_key_active; if (tx->key.keys && cur != new && atomic_cmpxchg(&rx->peer_rx_active, cur, new) == cur) { if (new) tipc_aead_users_inc(tx->aead[new], INT_MAX); if (cur) tipc_aead_users_dec(tx->aead[cur], 0); atomic64_set(&rx->sndnxt, 0); /* Mark the point TX key users changed */ tx->timer1 = jiffies; pr_debug("%s: key users changed %d-- %d++, peer %s\n", tx->name, cur, new, rx->name); } } static int tipc_crypto_key_revoke(struct net *net, u8 tx_key) { struct tipc_crypto *tx = tipc_net(net)->crypto_tx; struct tipc_key key; spin_lock_bh(&tx->lock); key = tx->key; WARN_ON(!key.active || tx_key != key.active); /* Free the active key */ tipc_crypto_key_set_state(tx, key.passive, 0, key.pending); tipc_crypto_key_detach(tx->aead[key.active], &tx->lock); spin_unlock_bh(&tx->lock); pr_warn("%s: key is revoked\n", tx->name); return -EKEYREVOKED; } int tipc_crypto_start(struct tipc_crypto **crypto, struct net *net, struct tipc_node *node) { struct tipc_crypto *c; if (*crypto) return -EEXIST; /* Allocate crypto */ c = kzalloc(sizeof(*c), GFP_ATOMIC); if (!c) return -ENOMEM; /* Allocate workqueue on TX */ if (!node) { c->wq 
= alloc_ordered_workqueue("tipc_crypto", 0); if (!c->wq) { kfree(c); return -ENOMEM; } } /* Allocate statistic structure */ c->stats = alloc_percpu_gfp(struct tipc_crypto_stats, GFP_ATOMIC); if (!c->stats) { if (c->wq) destroy_workqueue(c->wq); kfree_sensitive(c); return -ENOMEM; } c->flags = 0; c->net = net; c->node = node; get_random_bytes(&c->key_gen, 2); tipc_crypto_key_set_state(c, 0, 0, 0); atomic_set(&c->key_distr, 0); atomic_set(&c->peer_rx_active, 0); atomic64_set(&c->sndnxt, 0); c->timer1 = jiffies; c->timer2 = jiffies; c->rekeying_intv = TIPC_REKEYING_INTV_DEF; spin_lock_init(&c->lock); scnprintf(c->name, 48, "%s(%s)", (is_rx(c)) ? "RX" : "TX", (is_rx(c)) ? tipc_node_get_id_str(c->node) : tipc_own_id_string(c->net)); if (is_rx(c)) INIT_DELAYED_WORK(&c->work, tipc_crypto_work_rx); else INIT_DELAYED_WORK(&c->work, tipc_crypto_work_tx); *crypto = c; return 0; } void tipc_crypto_stop(struct tipc_crypto **crypto) { struct tipc_crypto *c = *crypto; u8 k; if (!c) return; /* Flush any queued works & destroy wq */ if (is_tx(c)) { c->rekeying_intv = 0; cancel_delayed_work_sync(&c->work); destroy_workqueue(c->wq); } /* Release AEAD keys */ rcu_read_lock(); for (k = KEY_MIN; k <= KEY_MAX; k++) tipc_aead_put(rcu_dereference(c->aead[k])); rcu_read_unlock(); pr_debug("%s: has been stopped\n", c->name); /* Free this crypto statistics */ free_percpu(c->stats); *crypto = NULL; kfree_sensitive(c); } void tipc_crypto_timeout(struct tipc_crypto *rx) { struct tipc_net *tn = tipc_net(rx->net); struct tipc_crypto *tx = tn->crypto_tx; struct tipc_key key; int cmd; /* TX pending: taking all users & stable -> active */ spin_lock(&tx->lock); key = tx->key; if (key.active && tipc_aead_users(tx->aead[key.active]) > 0) goto s1; if (!key.pending || tipc_aead_users(tx->aead[key.pending]) <= 0) goto s1; if (time_before(jiffies, tx->timer1 + TIPC_TX_LASTING_TIME)) goto s1; tipc_crypto_key_set_state(tx, key.passive, key.pending, 0); if (key.active) 
tipc_crypto_key_detach(tx->aead[key.active], &tx->lock); this_cpu_inc(tx->stats->stat[STAT_SWITCHES]); pr_info("%s: key[%d] is activated\n", tx->name, key.pending); s1: spin_unlock(&tx->lock); /* RX pending: having user -> active */ spin_lock(&rx->lock); key = rx->key; if (!key.pending || tipc_aead_users(rx->aead[key.pending]) <= 0) goto s2; if (key.active) key.passive = key.active; key.active = key.pending; rx->timer2 = jiffies; tipc_crypto_key_set_state(rx, key.passive, key.active, 0); this_cpu_inc(rx->stats->stat[STAT_SWITCHES]); pr_info("%s: key[%d] is activated\n", rx->name, key.pending); goto s5; s2: /* RX pending: not working -> remove */ if (!key.pending || tipc_aead_users(rx->aead[key.pending]) > -10) goto s3; tipc_crypto_key_set_state(rx, key.passive, key.active, 0); tipc_crypto_key_detach(rx->aead[key.pending], &rx->lock); pr_debug("%s: key[%d] is removed\n", rx->name, key.pending); goto s5; s3: /* RX active: timed out or no user -> pending */ if (!key.active) goto s4; if (time_before(jiffies, rx->timer1 + TIPC_RX_ACTIVE_LIM) && tipc_aead_users(rx->aead[key.active]) > 0) goto s4; if (key.pending) key.passive = key.active; else key.pending = key.active; rx->timer2 = jiffies; tipc_crypto_key_set_state(rx, key.passive, 0, key.pending); tipc_aead_users_set(rx->aead[key.pending], 0); pr_debug("%s: key[%d] is deactivated\n", rx->name, key.active); goto s5; s4: /* RX passive: outdated or not working -> free */ if (!key.passive) goto s5; if (time_before(jiffies, rx->timer2 + TIPC_RX_PASSIVE_LIM) && tipc_aead_users(rx->aead[key.passive]) > -10) goto s5; tipc_crypto_key_set_state(rx, 0, key.active, key.pending); tipc_crypto_key_detach(rx->aead[key.passive], &rx->lock); pr_debug("%s: key[%d] is freed\n", rx->name, key.passive); s5: spin_unlock(&rx->lock); /* Relax it here, the flag will be set again if it really is, but only * when we are not in grace period for safety! 
*/ if (time_after(jiffies, tx->timer2 + TIPC_TX_GRACE_PERIOD)) tx->legacy_user = 0; /* Limit max_tfms & do debug commands if needed */ if (likely(sysctl_tipc_max_tfms <= TIPC_MAX_TFMS_LIM)) return; cmd = sysctl_tipc_max_tfms; sysctl_tipc_max_tfms = TIPC_MAX_TFMS_DEF; tipc_crypto_do_cmd(rx->net, cmd); } static inline void tipc_crypto_clone_msg(struct net *net, struct sk_buff *_skb, struct tipc_bearer *b, struct tipc_media_addr *dst, struct tipc_node *__dnode, u8 type) { struct sk_buff *skb; skb = skb_clone(_skb, GFP_ATOMIC); if (skb) { TIPC_SKB_CB(skb)->xmit_type = type; tipc_crypto_xmit(net, &skb, b, dst, __dnode); if (skb) b->media->send_msg(net, skb, b, dst); } } /** * tipc_crypto_xmit - Build & encrypt TIPC message for xmit * @net: struct net * @skb: input/output message skb pointer * @b: bearer used for xmit later * @dst: destination media address * @__dnode: destination node for reference if any * * First, build an encryption message header on the top of the message, then * encrypt the original TIPC message by using the pending, master or active * key with this preference order. * If the encryption is successful, the encrypted skb is returned directly or * via the callback. * Otherwise, the skb is freed! 
*
 * Return:
 * * 0                   : the encryption has succeeded (or no encryption)
 * * -EINPROGRESS/-EBUSY : the encryption is ongoing, a callback will be made
 * * -ENOKEY             : the encryption has failed due to no key
 * * -EKEYREVOKED        : the encryption has failed due to key revoked
 * * -ENOMEM             : the encryption has failed due to no memory
 * * < 0                 : the encryption has failed due to other reasons
 */
int tipc_crypto_xmit(struct net *net, struct sk_buff **skb,
		     struct tipc_bearer *b, struct tipc_media_addr *dst,
		     struct tipc_node *__dnode)
{
	struct tipc_crypto *__rx = tipc_node_crypto_rx(__dnode);
	struct tipc_crypto *tx = tipc_net(net)->crypto_tx;
	struct tipc_crypto_stats __percpu *stats = tx->stats;
	struct tipc_msg *hdr = buf_msg(*skb);
	struct tipc_key key = tx->key;
	struct tipc_aead *aead = NULL;
	u32 user = msg_user(hdr);
	u32 type = msg_type(hdr);
	int rc = -ENOKEY;
	u8 tx_key = 0;

	/* No encryption? */
	if (!tx->working)
		return 0;

	/* Pending key if peer has active on it or probing time */
	if (unlikely(key.pending)) {
		tx_key = key.pending;
		if (!tx->key_master && !key.active)
			goto encrypt;
		if (__rx && atomic_read(&__rx->peer_rx_active) == tx_key)
			goto encrypt;
		if (TIPC_SKB_CB(*skb)->xmit_type == SKB_PROBING) {
			pr_debug("%s: probing for key[%d]\n", tx->name,
				 key.pending);
			goto encrypt;
		}
		/* Send an extra copy of this message marked as a probe */
		if (user == LINK_CONFIG || user == LINK_PROTOCOL)
			tipc_crypto_clone_msg(net, *skb, b, dst, __dnode,
					      SKB_PROBING);
	}

	/* Master key if this is a *vital* message or in grace period */
	if (tx->key_master) {
		tx_key = KEY_MASTER;
		if (!key.active)
			goto encrypt;
		if (TIPC_SKB_CB(*skb)->xmit_type == SKB_GRACING) {
			pr_debug("%s: gracing for msg (%d %d)\n", tx->name,
				 user, type);
			goto encrypt;
		}
		if (user == LINK_CONFIG ||
		    (user == LINK_PROTOCOL && type == RESET_MSG) ||
		    (user == MSG_CRYPTO && type == KEY_DISTR_MSG) ||
		    time_before(jiffies, tx->timer2 + TIPC_TX_GRACE_PERIOD)) {
			if (__rx && __rx->key_master &&
			    !atomic_read(&__rx->peer_rx_active))
				goto encrypt;
			if (!__rx) {
				if (likely(!tx->legacy_user))
					goto encrypt;
				/* Send an extra copy marked as gracing */
				tipc_crypto_clone_msg(net, *skb, b, dst,
						      __dnode, SKB_GRACING);
			}
		}
	}

	/* Else, use the active key if any */
	if (likely(key.active)) {
		tx_key = key.active;
		goto encrypt;
	}

	goto exit;

encrypt:
	aead = tipc_aead_get(tx->aead[tx_key]);
	if (unlikely(!aead))
		goto exit;
	rc = tipc_ehdr_build(net, aead, tx_key, *skb, __rx);
	if (likely(rc > 0))
		rc = tipc_aead_encrypt(aead, *skb, b, dst, __dnode);

exit:
	switch (rc) {
	case 0:
		this_cpu_inc(stats->stat[STAT_OK]);
		break;
	case -EINPROGRESS:
	case -EBUSY:
		/* The skb and the key reference now belong to the callback */
		this_cpu_inc(stats->stat[STAT_ASYNC]);
		*skb = NULL;
		return rc;
	default:
		this_cpu_inc(stats->stat[STAT_NOK]);
		if (rc == -ENOKEY)
			this_cpu_inc(stats->stat[STAT_NOKEYS]);
		else if (rc == -EKEYREVOKED)
			this_cpu_inc(stats->stat[STAT_BADKEYS]);
		kfree_skb(*skb);
		*skb = NULL;
		break;
	}

	tipc_aead_put(aead);
	return rc;
}

/**
 * tipc_crypto_rcv - Decrypt an encrypted TIPC message from peer
 * @net: struct net
 * @rx: RX crypto handle
 * @skb: input/output message skb pointer
 * @b: bearer where the message has been received
 *
 * If the decryption is successful, the decrypted skb is returned directly or
 * as the callback, the encryption header and auth tag will be trimmed out
 * before forwarding to tipc_rcv() via the tipc_crypto_rcv_complete().
 * Otherwise, the skb will be freed!
 * Note: RX key(s) can be re-aligned, or in case of no key suitable, TX
 *   cluster key(s) can be taken for decryption (- recursive).
*
 * Return:
 * * 0                   : the decryption has successfully completed
 * * -EINPROGRESS/-EBUSY : the decryption is ongoing, a callback will be made
 * * -ENOKEY             : the decryption has failed due to no key
 * * -EBADMSG            : the decryption has failed due to bad message
 * * -ENOMEM             : the decryption has failed due to no memory
 * * < 0                 : the decryption has failed due to other reasons
 */
int tipc_crypto_rcv(struct net *net, struct tipc_crypto *rx,
		    struct sk_buff **skb, struct tipc_bearer *b)
{
	struct tipc_crypto *tx = tipc_net(net)->crypto_tx;
	struct tipc_crypto_stats __percpu *stats;
	struct tipc_aead *aead = NULL;
	struct tipc_key key;
	int rc = -ENOKEY;
	u8 tx_key, n;

	/* The peer's TX key index is read from the encryption header that
	 * sits at the start of the skb data.
	 */
	tx_key = ((struct tipc_ehdr *)(*skb)->data)->tx_key;

	/* New peer?
	 * Let's try with TX key (i.e. cluster mode) & verify the skb first!
	 */
	if (unlikely(!rx || tx_key == KEY_MASTER))
		goto pick_tx;

	/* Pick RX key according to TX key if any */
	key = rx->key;
	if (tx_key == key.active || tx_key == key.pending ||
	    tx_key == key.passive)
		goto decrypt;

	/* Unknown key, let's try to align RX key(s) */
	if (tipc_crypto_key_try_align(rx, tx_key))
		goto decrypt;

pick_tx:
	/* No key suitable? Try to pick one from TX... */
	aead = tipc_crypto_key_pick_tx(tx, rx, *skb, tx_key);
	if (aead)
		goto decrypt;
	goto exit;

decrypt:
	rcu_read_lock();
	/* aead is already set when it was picked from TX above */
	if (!aead)
		aead = tipc_aead_get(rx->aead[tx_key]);
	rc = tipc_aead_decrypt(net, aead, *skb, b);
	rcu_read_unlock();

exit:
	/* Account on the RX crypto when there is one, else on TX */
	stats = ((rx) ?: tx)->stats;
	switch (rc) {
	case 0:
		this_cpu_inc(stats->stat[STAT_OK]);
		break;
	case -EINPROGRESS:
	case -EBUSY:
		this_cpu_inc(stats->stat[STAT_ASYNC]);
		/* Returns without calling tipc_crypto_rcv_complete() here */
		*skb = NULL;
		return rc;
	default:
		this_cpu_inc(stats->stat[STAT_NOK]);
		if (rc == -ENOKEY) {
			kfree_skb(*skb);
			*skb = NULL;
			if (rx) {
				/* Mark rx->nokey only if we don't have a
				 * pending received session key, nor a newer
				 * one i.e. in the next slot.
				 */
				n = key_next(tx_key);
				rx->nokey = !(rx->skey ||
					      rcu_access_pointer(rx->aead[n]));
				pr_debug_ratelimited("%s: nokey %d, key %d/%x\n",
						     rx->name, rx->nokey,
						     tx_key, rx->key.keys);
				tipc_node_put(rx->node);
			}
			this_cpu_inc(stats->stat[STAT_NOKEYS]);
			return rc;
		} else if (rc == -EBADMSG) {
			this_cpu_inc(stats->stat[STAT_BADMSGS]);
		}
		break;
	}

	/* Synchronous result (success or failure other than -ENOKEY) */
	tipc_crypto_rcv_complete(net, aead, b, skb, rc);
	return rc;
}

/*
 * tipc_crypto_rcv_complete - Finish RX processing once decryption is done
 * @net: struct net
 * @aead: the key used for the decryption; one tipc_aead_put() is done on exit
 * @b: bearer where the message has been received
 * @skb: input/output message skb pointer; set to NULL when the skb is freed
 * @err: the decryption result, 0 on success
 *
 * On success the encryption header and the auth tag are removed, the message
 * is validated and the skb is marked as decrypted. On any failure the skb is
 * freed. When the decryption was done with a TX key, that key is (except for
 * the master key) cloned and attached to the peer's RX crypto first.
 */
static void tipc_crypto_rcv_complete(struct net *net, struct tipc_aead *aead,
				     struct tipc_bearer *b,
				     struct sk_buff **skb, int err)
{
	struct tipc_skb_cb *skb_cb = TIPC_SKB_CB(*skb);
	struct tipc_crypto *rx = aead->crypto;
	struct tipc_aead *tmp = NULL;
	struct tipc_ehdr *ehdr;
	struct tipc_node *n;

	/* Is this completed by TX? */
	if (unlikely(is_tx(aead->crypto))) {
		/* The RX crypto (possibly NULL) is carried in the skb cb */
		rx = skb_cb->tx_clone_ctx.rx;
		pr_debug("TX->RX(%s): err %d, aead %p, skb->next %p, flags %x\n",
			 (rx) ? tipc_node_get_id_str(rx->node) : "-", err, aead,
			 (*skb)->next, skb_cb->flags);
		pr_debug("skb_cb [recurs %d, last %p], tx->aead [%p %p %p]\n",
			 skb_cb->tx_clone_ctx.recurs, skb_cb->tx_clone_ctx.last,
			 aead->crypto->aead[1], aead->crypto->aead[2],
			 aead->crypto->aead[3]);
		if (unlikely(err)) {
			/* Bad message with this key: hand the skb chained on
			 * ->next to tipc_rcv(), then drop this one.
			 */
			if (err == -EBADMSG && (*skb)->next)
				tipc_rcv(net, (*skb)->next, b);
			goto free_skb;
		}

		/* Success: the skb chained on ->next is no longer needed */
		if (likely((*skb)->next)) {
			kfree_skb((*skb)->next);
			(*skb)->next = NULL;
		}
		ehdr = (struct tipc_ehdr *)(*skb)->data;
		if (!rx) {
			/* No RX crypto yet: create the node from the id in
			 * the encryption header and use its RX crypto.
			 */
			WARN_ON(ehdr->user != LINK_CONFIG);
			n = tipc_node_create(net, 0, ehdr->id, 0xffffu, 0,
					     true);
			rx = tipc_node_crypto_rx(n);
			if (unlikely(!rx))
				goto free_skb;
		}

		/* Ignore cloning if it was TX master key */
		if (ehdr->tx_key == KEY_MASTER)
			goto rcv;
		if (tipc_aead_clone(&tmp, aead) < 0)
			goto rcv;
		WARN_ON(!refcount_inc_not_zero(&tmp->refcnt));
		if (tipc_crypto_key_attach(rx, tmp, ehdr->tx_key, false) < 0) {
			tipc_aead_free(&tmp->rcu);
			goto rcv;
		}
		/* From here on work with the clone attached to RX */
		tipc_aead_put(aead);
		aead = tmp;
	}

	if (unlikely(err)) {
		tipc_aead_users_dec((struct tipc_aead __force __rcu *)aead,
				    INT_MIN);
		goto free_skb;
	}

	/* Set the RX key's user */
	tipc_aead_users_set((struct tipc_aead __force __rcu *)aead, 1);

	/* Mark this point, RX works */
	rx->timer1 = jiffies;

rcv:
	/* Remove ehdr & auth. tag prior to tipc_rcv() */
	ehdr = (struct tipc_ehdr *)(*skb)->data;

	/* Mark this point, RX passive still works */
	if (rx->key.passive && ehdr->tx_key == rx->key.passive)
		rx->timer2 = jiffies;

	skb_reset_network_header(*skb);
	skb_pull(*skb, tipc_ehdr_size(ehdr));
	if (pskb_trim(*skb, (*skb)->len - aead->authsize))
		goto free_skb;

	/* Validate TIPCv2 message */
	if (unlikely(!tipc_msg_validate(skb))) {
		pr_err_ratelimited("Packet dropped after decryption!\n");
		goto free_skb;
	}

	/* Ok, everything's fine, try to synch own keys according to peers' */
	tipc_crypto_key_synch(rx, *skb);

	/* Re-fetch skb cb as skb might be changed in tipc_msg_validate */
	skb_cb = TIPC_SKB_CB(*skb);

	/* Mark skb decrypted */
	skb_cb->decrypted = 1;

	/* Clear clone cxt if any */
	if (likely(!skb_cb->tx_clone_deferred))
		goto exit;
	skb_cb->tx_clone_deferred = 0;
	memset(&skb_cb->tx_clone_ctx, 0, sizeof(skb_cb->tx_clone_ctx));
	goto exit;

free_skb:
	kfree_skb(*skb);
	*skb = NULL;

exit:
	/* Common exit: drop the key reference and, if any, the node one */
	tipc_aead_put(aead);
	if (rx)
		tipc_node_put(rx->node);
}

/*
 * tipc_crypto_do_cmd - Execute a crypto debug command
 * @net: struct net
 * @cmd: the command code; only 0xfff1 (print statistics) is handled, any
 *       other value makes the function return without doing anything
 *
 * Prints, via pr_info(), the key status of the TX crypto and of the RX crypto
 * of every node on the node list, followed by the per-CPU statistic counters.
 */
static void tipc_crypto_do_cmd(struct net *net, int cmd)
{
	struct tipc_net *tn = tipc_net(net);
	struct tipc_crypto *tx = tn->crypto_tx, *rx;
	struct list_head *p;
	unsigned int stat;
	int i, j, cpu;
	/* Scratch line buffer; the 200 below is this size */
	char buf[200];

	/* Currently only one command is supported */
	switch (cmd) {
	case 0xfff1:
		goto print_stats;
	default:
		return;
	}

print_stats:
	/* Print a header */
	pr_info("\n=============== TIPC Crypto Statistics ===============\n\n");

	/* Print key status */
	pr_info("Key status:\n");
	pr_info("TX(%7.7s)\n%s", tipc_own_id_string(net),
		tipc_crypto_key_dump(tx, buf));

	rcu_read_lock();
	for (p = tn->node_list.next; p != &tn->node_list; p = p->next) {
		rx = tipc_node_crypto_rx_by_list(p);
		pr_info("RX(%7.7s)\n%s", tipc_node_get_id_str(rx->node),
			tipc_crypto_key_dump(rx, buf));
	}
	rcu_read_unlock();

	/* Print crypto statistics */
	/* Column headers first, one per counter */
	for (i = 0, j = 0; i < MAX_STATS; i++)
		j += scnprintf(buf + j, 200 - j, "|%11s ", hstats[i]);
	pr_info("Counter %s", buf);

	/* Separator line of 115 dashes */
	memset(buf, '-', 115);
	buf[115] = '\0';
	pr_info("%s\n", buf);

	/* One output line per possible CPU; lines after the first start with
	 * a blank field instead of the "TX(...)" label.
	 */
	j = scnprintf(buf, 200, "TX(%7.7s) ", tipc_own_id_string(net));
	for_each_possible_cpu(cpu) {
		for (i = 0; i < MAX_STATS; i++) {
			stat = per_cpu_ptr(tx->stats, cpu)->stat[i];
			j += scnprintf(buf + j, 200 - j, "|%11d ", stat);
		}
		pr_info("%s", buf);
		j = scnprintf(buf, 200, "%12s", " ");
	}

	/* Same layout for the RX crypto of every node on the list */
	rcu_read_lock();
	for (p = tn->node_list.next; p != &tn->node_list; p = p->next) {
		rx = tipc_node_crypto_rx_by_list(p);
		j = scnprintf(buf, 200, "RX(%7.7s) ",
			      tipc_node_get_id_str(rx->node));
		for_each_possible_cpu(cpu) {
			for (i = 0; i < MAX_STATS; i++) {
				stat = per_cpu_ptr(rx->stats, cpu)->stat[i];
				j += scnprintf(buf + j, 200 - j, "|%11d ",
					       stat);
			}
			pr_info("%s", buf);
			j = scnprintf(buf, 200, "%12s", " ");
		}
	}
	rcu_read_unlock();

	pr_info("\n======================== Done ========================\n");
}

/*
 * tipc_crypto_key_dump - Format the key status of a crypto into a buffer
 * @c: the TX or RX crypto to dump
 * @buf: output buffer; every write is bounded against a size of 200 bytes,
 *       so the caller must provide at least that much
 *
 * One line is written per key slot from KEY_MIN to KEY_MAX (the master key
 * slot is skipped for an RX crypto), plus the peer RX active key for RX.
 *
 * Return: @buf
 */
static char *tipc_crypto_key_dump(struct tipc_crypto *c, char *buf)
{
	struct tipc_key key = c->key;
	struct tipc_aead *aead;
	int k, i = 0;
	char *s;

	for (k = KEY_MIN; k <= KEY_MAX; k++) {
		if (k == KEY_MASTER) {
			if (is_rx(c))
				continue;
			/* Master key is shown as active while within the TX
			 * grace period counted from timer2.
			 */
			if (time_before(jiffies,
					c->timer2 + TIPC_TX_GRACE_PERIOD))
				s = "ACT";
			else
				s = "PAS";
		} else {
			if (k == key.passive)
				s = "PAS";
			else if (k == key.active)
				s = "ACT";
			else if (k == key.pending)
				s = "PEN";
			else
				s = "-";
		}
		i += scnprintf(buf + i, 200 - i, "\tKey%d: %s", k, s);

		/* Append hint, mode, users and refcount when a key is set */
		rcu_read_lock();
		aead = rcu_dereference(c->aead[k]);
		if (aead)
			i += scnprintf(buf + i, 200 - i,
				       "{\"0x...%s\", \"%s\"}/%d:%d",
				       aead->hint,
				       (aead->mode == CLUSTER_KEY) ? "c" : "p",
				       atomic_read(&aead->users),
				       refcount_read(&aead->refcnt));
		rcu_read_unlock();
		i += scnprintf(buf + i, 200 - i, "\n");
	}

	if (is_rx(c))
		i += scnprintf(buf + i, 200 - i, "\tPeer RX active: %d\n",
			       atomic_read(&c->peer_rx_active));

	return buf;
}

/*
 * tipc_key_change_dump - Format a key state transition into a buffer
 * @old: the key state before the change
 * @new: the key state after the change
 * @buf: output buffer; every write is bounded against a size of 32 bytes
 *
 * Return: @buf
 */
static char *tipc_key_change_dump(struct tipc_key old, struct tipc_key new,
				  char *buf)
{
	struct tipc_key *key = &old;
	int k, i = 0;
	char *s;

	/* Output format: "[%s %s %s] -> [%s %s %s]", max len = 32 */
again:
	/* First pass formats 'old', second pass (after the goto) 'new' */
	i += scnprintf(buf + i, 32 - i, "[");
	for (k = KEY_1; k <= KEY_3; k++) {
		if (k == key->passive)
			s = "pas";
		else if (k == key->active)
			s = "act";
		else if (k == key->pending)
			s = "pen";
		else
			s = "-";
		i += scnprintf(buf + i, 32 - i,
			       (k != KEY_3) ? "%s " : "%s", s);
	}
	if (key != &new) {
		i += scnprintf(buf + i, 32 - i, "] -> ");
		key = &new;
		goto again;
	}
	i += scnprintf(buf + i, 32 - i, "]");
	return buf;
}

/**
 * tipc_crypto_msg_rcv - Common 'MSG_CRYPTO' processing point
 * @net: the struct net
 * @skb: the receiving message buffer
 *
 * The skb is always freed before returning.
 */
void tipc_crypto_msg_rcv(struct net *net, struct sk_buff *skb)
{
	struct tipc_crypto *rx;
	struct tipc_msg *hdr;

	if (unlikely(skb_linearize(skb)))
		goto exit;

	hdr = buf_msg(skb);
	rx = tipc_node_crypto_rx_by_addr(net, msg_prevnode(hdr));
	if (unlikely(!rx))
		goto exit;

	switch (msg_type(hdr)) {
	case KEY_DISTR_MSG:
		/* Key scheduled for attaching: skip the tipc_node_put()
		 * below; tipc_crypto_work_rx() does it when it finishes.
		 */
		if (tipc_crypto_key_rcv(rx, hdr))
			goto exit;
		break;
	default:
		break;
	}

	tipc_node_put(rx->node);

exit:
	kfree_skb(skb);
}

/**
 * tipc_crypto_key_distr - Distribute a TX key
 * @tx: the TX crypto
 * @key: the key's index
 * @dest: the destination tipc node, = NULL if distributing to all nodes
 *
 * Nothing is sent, and 0 is returned, when the key exchange is disabled via
 * sysctl_tipc_key_exchange_enabled. -ENOKEY is returned when @key is 0 or
 * no key is set at that index.
 *
 * Return: 0 in case of success, otherwise < 0
 */
int tipc_crypto_key_distr(struct tipc_crypto *tx, u8 key,
			  struct tipc_node *dest)
{
	struct tipc_aead *aead;
	u32 dnode = tipc_node_get_addr(dest);
	int rc = -ENOKEY;

	if (!sysctl_tipc_key_exchange_enabled)
		return 0;

	if (key) {
		rcu_read_lock();
		aead = tipc_aead_get(tx->aead[key]);
		if (likely(aead)) {
			rc = tipc_crypto_key_xmit(tx->net, aead->key,
						  aead->gen, aead->mode,
						  dnode);
			tipc_aead_put(aead);
		}
		rcu_read_unlock();
	}

	return rc;
}

/**
 * tipc_crypto_key_xmit - Send a session key
 * @net: the struct net
 * @skey: the session key to be sent
 * @gen: the key's generation
 * @mode: the key's mode
 * @dnode: the destination node address, = 0 if broadcasting to all nodes
 *
 * The session key 'skey' is packed in a TIPC v2 'MSG_CRYPTO/KEY_DISTR_MSG'
 * as its data section, then xmit-ed through the uc/bc link.
 *
 * Return: 0 in case of success, otherwise < 0
 */
static int tipc_crypto_key_xmit(struct net *net, struct tipc_aead_key *skey,
				u16 gen, u8 mode, u32 dnode)
{
	struct sk_buff_head pkts;
	struct tipc_msg *hdr;
	struct sk_buff *skb;
	u16 size, cong_link_cnt;
	u8 *data;
	int rc;

	size = tipc_aead_key_size(skey);
	skb = tipc_buf_acquire(INT_H_SIZE + size, GFP_ATOMIC);
	if (!skb)
		return -ENOMEM;

	hdr = buf_msg(skb);
	tipc_msg_init(tipc_own_addr(net), hdr, MSG_CRYPTO, KEY_DISTR_MSG,
		      INT_H_SIZE, dnode);
	msg_set_size(hdr, INT_H_SIZE + size);
	msg_set_key_gen(hdr, gen);
	msg_set_key_mode(hdr, mode);

	/* Data layout: algorithm name (TIPC_AEAD_ALG_NAME bytes), key length
	 * as a big-endian 32-bit value, then the key bytes.
	 */
	data = msg_data(hdr);
	*((__be32 *)(data + TIPC_AEAD_ALG_NAME)) = htonl(skey->keylen);
	memcpy(data, skey->alg_name, TIPC_AEAD_ALG_NAME);
	memcpy(data + TIPC_AEAD_ALG_NAME + sizeof(__be32), skey->key,
	       skey->keylen);

	__skb_queue_head_init(&pkts);
	__skb_queue_tail(&pkts, skb);
	if (dnode)
		rc = tipc_node_xmit(net, &pkts, dnode, 0);
	else
		rc = tipc_bcast_xmit(net, &pkts, &cong_link_cnt);

	return rc;
}

/**
 * tipc_crypto_key_rcv - Receive a session key
 * @rx: the RX crypto
 * @hdr: the TIPC v2 message incl. the receiving session key in its data
 *
 * This function retrieves the session key in the message from peer, then
 * schedules a RX work to attach the key to the corresponding RX crypto.
 *
 * Return: "true" if the key has been scheduled for attaching, otherwise
 * "false".
*/ static bool tipc_crypto_key_rcv(struct tipc_crypto *rx, struct tipc_msg *hdr) { struct tipc_crypto *tx = tipc_net(rx->net)->crypto_tx; struct tipc_aead_key *skey = NULL; u16 key_gen = msg_key_gen(hdr); u32 size = msg_data_sz(hdr); u8 *data = msg_data(hdr); unsigned int keylen; /* Verify whether the size can exist in the packet */ if (unlikely(size < sizeof(struct tipc_aead_key) + TIPC_AEAD_KEYLEN_MIN)) { pr_debug("%s: message data size is too small\n", rx->name); goto exit; } keylen = ntohl(*((__be32 *)(data + TIPC_AEAD_ALG_NAME))); /* Verify the supplied size values */ if (unlikely(size != keylen + sizeof(struct tipc_aead_key) || keylen > TIPC_AEAD_KEY_SIZE_MAX)) { pr_debug("%s: invalid MSG_CRYPTO key size\n", rx->name); goto exit; } spin_lock(&rx->lock); if (unlikely(rx->skey || (key_gen == rx->key_gen && rx->key.keys))) { pr_err("%s: key existed <%p>, gen %d vs %d\n", rx->name, rx->skey, key_gen, rx->key_gen); goto exit_unlock; } /* Allocate memory for the key */ skey = kmalloc(size, GFP_ATOMIC); if (unlikely(!skey)) { pr_err("%s: unable to allocate memory for skey\n", rx->name); goto exit_unlock; } /* Copy key from msg data */ skey->keylen = keylen; memcpy(skey->alg_name, data, TIPC_AEAD_ALG_NAME); memcpy(skey->key, data + TIPC_AEAD_ALG_NAME + sizeof(__be32), skey->keylen); rx->key_gen = key_gen; rx->skey_mode = msg_key_mode(hdr); rx->skey = skey; rx->nokey = 0; mb(); /* for nokey flag */ exit_unlock: spin_unlock(&rx->lock); exit: /* Schedule the key attaching on this crypto */ if (likely(skey && queue_delayed_work(tx->wq, &rx->work, 0))) return true; return false; } /** * tipc_crypto_work_rx - Scheduled RX works handler * @work: the struct RX work * * The function processes the previous scheduled works i.e. distributing TX key * or attaching a received session key on RX crypto. 
*/ static void tipc_crypto_work_rx(struct work_struct *work) { struct delayed_work *dwork = to_delayed_work(work); struct tipc_crypto *rx = container_of(dwork, struct tipc_crypto, work); struct tipc_crypto *tx = tipc_net(rx->net)->crypto_tx; unsigned long delay = msecs_to_jiffies(5000); bool resched = false; u8 key; int rc; /* Case 1: Distribute TX key to peer if scheduled */ if (atomic_cmpxchg(&rx->key_distr, KEY_DISTR_SCHED, KEY_DISTR_COMPL) == KEY_DISTR_SCHED) { /* Always pick the newest one for distributing */ key = tx->key.pending ?: tx->key.active; rc = tipc_crypto_key_distr(tx, key, rx->node); if (unlikely(rc)) pr_warn("%s: unable to distr key[%d] to %s, err %d\n", tx->name, key, tipc_node_get_id_str(rx->node), rc); /* Sched for key_distr releasing */ resched = true; } else { atomic_cmpxchg(&rx->key_distr, KEY_DISTR_COMPL, 0); } /* Case 2: Attach a pending received session key from peer if any */ if (rx->skey) { rc = tipc_crypto_key_init(rx, rx->skey, rx->skey_mode, false); if (unlikely(rc < 0)) pr_warn("%s: unable to attach received skey, err %d\n", rx->name, rc); switch (rc) { case -EBUSY: case -ENOMEM: /* Resched the key attaching */ resched = true; break; default: synchronize_rcu(); kfree(rx->skey); rx->skey = NULL; break; } } if (resched && queue_delayed_work(tx->wq, &rx->work, delay)) return; tipc_node_put(rx->node); } /** * tipc_crypto_rekeying_sched - (Re)schedule rekeying w/o new interval * @tx: TX crypto * @changed: if the rekeying needs to be rescheduled with new interval * @new_intv: new rekeying interval (when "changed" = true) */ void tipc_crypto_rekeying_sched(struct tipc_crypto *tx, bool changed, u32 new_intv) { unsigned long delay; bool now = false; if (changed) { if (new_intv == TIPC_REKEYING_NOW) now = true; else tx->rekeying_intv = new_intv; cancel_delayed_work_sync(&tx->work); } if (tx->rekeying_intv || now) { delay = (now) ? 
0 : tx->rekeying_intv * 60 * 1000; queue_delayed_work(tx->wq, &tx->work, msecs_to_jiffies(delay)); } } /** * tipc_crypto_work_tx - Scheduled TX works handler * @work: the struct TX work * * The function processes the previous scheduled work, i.e. key rekeying, by * generating a new session key based on current one, then attaching it to the * TX crypto and finally distributing it to peers. It also re-schedules the * rekeying if needed. */ static void tipc_crypto_work_tx(struct work_struct *work) { struct delayed_work *dwork = to_delayed_work(work); struct tipc_crypto *tx = container_of(dwork, struct tipc_crypto, work); struct tipc_aead_key *skey = NULL; struct tipc_key key = tx->key; struct tipc_aead *aead; int rc = -ENOMEM; if (unlikely(key.pending)) goto resched; /* Take current key as a template */ rcu_read_lock(); aead = rcu_dereference(tx->aead[key.active ?: KEY_MASTER]); if (unlikely(!aead)) { rcu_read_unlock(); /* At least one key should exist for securing */ return; } /* Lets duplicate it first */ skey = kmemdup(aead->key, tipc_aead_key_size(aead->key), GFP_ATOMIC); rcu_read_unlock(); /* Now, generate new key, initiate & distribute it */ if (likely(skey)) { rc = tipc_aead_key_generate(skey) ?: tipc_crypto_key_init(tx, skey, PER_NODE_KEY, false); if (likely(rc > 0)) rc = tipc_crypto_key_distr(tx, rc, NULL); kfree_sensitive(skey); } if (unlikely(rc)) pr_warn_ratelimited("%s: rekeying returns %d\n", tx->name, rc); resched: /* Re-schedule rekeying if any */ tipc_crypto_rekeying_sched(tx, false, 0); }
3181 635 2706 3385 84 2161 13 68 325 1 110 3025 10 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 
514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 
1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 
1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 
1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 
2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 
2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 
3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 
3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 
3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 
4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 
4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 
5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 // SPDX-License-Identifier: GPL-2.0 // Generated by scripts/atomic/gen-atomic-instrumented.sh // DO NOT MODIFY THIS FILE DIRECTLY /* * This file provides atomic operations with explicit instrumentation (e.g. * KASAN, KCSAN), which should be used unless it is necessary to avoid * instrumentation. Where it is necessary to avoid instrumentation, the * raw_atomic*() operations should be used. */ #ifndef _LINUX_ATOMIC_INSTRUMENTED_H #define _LINUX_ATOMIC_INSTRUMENTED_H #include <linux/build_bug.h> #include <linux/compiler.h> #include <linux/instrumented.h> /** * atomic_read() - atomic load with relaxed ordering * @v: pointer to atomic_t * * Atomically loads the value of @v with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_read() there. * * Return: The value loaded from @v. */ static __always_inline int atomic_read(const atomic_t *v) { instrument_atomic_read(v, sizeof(*v)); return raw_atomic_read(v); } /** * atomic_read_acquire() - atomic load with acquire ordering * @v: pointer to atomic_t * * Atomically loads the value of @v with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_read_acquire() there. * * Return: The value loaded from @v. */ static __always_inline int atomic_read_acquire(const atomic_t *v) { instrument_atomic_read(v, sizeof(*v)); return raw_atomic_read_acquire(v); } /** * atomic_set() - atomic set with relaxed ordering * @v: pointer to atomic_t * @i: int value to assign * * Atomically sets @v to @i with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_set() there. * * Return: Nothing. 
*/ static __always_inline void atomic_set(atomic_t *v, int i) { instrument_atomic_write(v, sizeof(*v)); raw_atomic_set(v, i); } /** * atomic_set_release() - atomic set with release ordering * @v: pointer to atomic_t * @i: int value to assign * * Atomically sets @v to @i with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_set_release() there. * * Return: Nothing. */ static __always_inline void atomic_set_release(atomic_t *v, int i) { kcsan_release(); instrument_atomic_write(v, sizeof(*v)); raw_atomic_set_release(v, i); } /** * atomic_add() - atomic add with relaxed ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_add() there. * * Return: Nothing. */ static __always_inline void atomic_add(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_add(i, v); } /** * atomic_add_return() - atomic add with full ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_add_return() there. * * Return: The updated value of @v. */ static __always_inline int atomic_add_return(int i, atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_add_return(i, v); } /** * atomic_add_return_acquire() - atomic add with acquire ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_add_return_acquire() there. * * Return: The updated value of @v. 
*/ static __always_inline int atomic_add_return_acquire(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_add_return_acquire(i, v); } /** * atomic_add_return_release() - atomic add with release ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_add_return_release() there. * * Return: The updated value of @v. */ static __always_inline int atomic_add_return_release(int i, atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_add_return_release(i, v); } /** * atomic_add_return_relaxed() - atomic add with relaxed ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_add_return_relaxed() there. * * Return: The updated value of @v. */ static __always_inline int atomic_add_return_relaxed(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_add_return_relaxed(i, v); } /** * atomic_fetch_add() - atomic add with full ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_add() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_add(int i, atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_add(i, v); } /** * atomic_fetch_add_acquire() - atomic add with acquire ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_add_acquire() there. * * Return: The original value of @v. 
*/ static __always_inline int atomic_fetch_add_acquire(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_add_acquire(i, v); } /** * atomic_fetch_add_release() - atomic add with release ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_add_release() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_add_release(int i, atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_add_release(i, v); } /** * atomic_fetch_add_relaxed() - atomic add with relaxed ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_add_relaxed() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_add_relaxed(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_add_relaxed(i, v); } /** * atomic_sub() - atomic subtract with relaxed ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_sub() there. * * Return: Nothing. */ static __always_inline void atomic_sub(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_sub(i, v); } /** * atomic_sub_return() - atomic subtract with full ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_sub_return() there. * * Return: The updated value of @v. 
*/ static __always_inline int atomic_sub_return(int i, atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_sub_return(i, v); } /** * atomic_sub_return_acquire() - atomic subtract with acquire ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_sub_return_acquire() there. * * Return: The updated value of @v. */ static __always_inline int atomic_sub_return_acquire(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_sub_return_acquire(i, v); } /** * atomic_sub_return_release() - atomic subtract with release ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_sub_return_release() there. * * Return: The updated value of @v. */ static __always_inline int atomic_sub_return_release(int i, atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_sub_return_release(i, v); } /** * atomic_sub_return_relaxed() - atomic subtract with relaxed ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_sub_return_relaxed() there. * * Return: The updated value of @v. */ static __always_inline int atomic_sub_return_relaxed(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_sub_return_relaxed(i, v); } /** * atomic_fetch_sub() - atomic subtract with full ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_sub() there. * * Return: The original value of @v. 
*/ static __always_inline int atomic_fetch_sub(int i, atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_sub(i, v); } /** * atomic_fetch_sub_acquire() - atomic subtract with acquire ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_sub_acquire() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_sub_acquire(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_sub_acquire(i, v); } /** * atomic_fetch_sub_release() - atomic subtract with release ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_sub_release() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_sub_release(int i, atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_sub_release(i, v); } /** * atomic_fetch_sub_relaxed() - atomic subtract with relaxed ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_sub_relaxed() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_sub_relaxed(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_sub_relaxed(i, v); } /** * atomic_inc() - atomic increment with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_inc() there. * * Return: Nothing. 
*/ static __always_inline void atomic_inc(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_inc(v); } /** * atomic_inc_return() - atomic increment with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_inc_return() there. * * Return: The updated value of @v. */ static __always_inline int atomic_inc_return(atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_inc_return(v); } /** * atomic_inc_return_acquire() - atomic increment with acquire ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_inc_return_acquire() there. * * Return: The updated value of @v. */ static __always_inline int atomic_inc_return_acquire(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_inc_return_acquire(v); } /** * atomic_inc_return_release() - atomic increment with release ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_inc_return_release() there. * * Return: The updated value of @v. */ static __always_inline int atomic_inc_return_release(atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_inc_return_release(v); } /** * atomic_inc_return_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_inc_return_relaxed() there. * * Return: The updated value of @v. 
*/ static __always_inline int atomic_inc_return_relaxed(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_inc_return_relaxed(v); } /** * atomic_fetch_inc() - atomic increment with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_inc() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_inc(atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_inc(v); } /** * atomic_fetch_inc_acquire() - atomic increment with acquire ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_inc_acquire() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_inc_acquire(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_inc_acquire(v); } /** * atomic_fetch_inc_release() - atomic increment with release ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_inc_release() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_inc_release(atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_inc_release(v); } /** * atomic_fetch_inc_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_inc_relaxed() there. * * Return: The original value of @v. 
*/ static __always_inline int atomic_fetch_inc_relaxed(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_inc_relaxed(v); } /** * atomic_dec() - atomic decrement with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_dec() there. * * Return: Nothing. */ static __always_inline void atomic_dec(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_dec(v); } /** * atomic_dec_return() - atomic decrement with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_dec_return() there. * * Return: The updated value of @v. */ static __always_inline int atomic_dec_return(atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_dec_return(v); } /** * atomic_dec_return_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_dec_return_acquire() there. * * Return: The updated value of @v. */ static __always_inline int atomic_dec_return_acquire(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_dec_return_acquire(v); } /** * atomic_dec_return_release() - atomic decrement with release ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_dec_return_release() there. * * Return: The updated value of @v. */ static __always_inline int atomic_dec_return_release(atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_dec_return_release(v); } /** * atomic_dec_return_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with relaxed ordering. 
* * Unsafe to use in noinstr code; use raw_atomic_dec_return_relaxed() there. * * Return: The updated value of @v. */ static __always_inline int atomic_dec_return_relaxed(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_dec_return_relaxed(v); } /** * atomic_fetch_dec() - atomic decrement with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_dec() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_dec(atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_dec(v); } /** * atomic_fetch_dec_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_dec_acquire() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_dec_acquire(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_dec_acquire(v); } /** * atomic_fetch_dec_release() - atomic decrement with release ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_dec_release() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_dec_release(atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_dec_release(v); } /** * atomic_fetch_dec_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_dec_relaxed() there. * * Return: The original value of @v. 
*/ static __always_inline int atomic_fetch_dec_relaxed(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_dec_relaxed(v); } /** * atomic_and() - atomic bitwise AND with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_and() there. * * Return: Nothing. */ static __always_inline void atomic_and(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_and(i, v); } /** * atomic_fetch_and() - atomic bitwise AND with full ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_and() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_and(int i, atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_and(i, v); } /** * atomic_fetch_and_acquire() - atomic bitwise AND with acquire ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_and_acquire() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_and_acquire(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_and_acquire(i, v); } /** * atomic_fetch_and_release() - atomic bitwise AND with release ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_and_release() there. * * Return: The original value of @v. 
*/ static __always_inline int atomic_fetch_and_release(int i, atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_and_release(i, v); } /** * atomic_fetch_and_relaxed() - atomic bitwise AND with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_and_relaxed() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_and_relaxed(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_and_relaxed(i, v); } /** * atomic_andnot() - atomic bitwise AND NOT with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_andnot() there. * * Return: Nothing. */ static __always_inline void atomic_andnot(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_andnot(i, v); } /** * atomic_fetch_andnot() - atomic bitwise AND NOT with full ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_andnot() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_andnot(int i, atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_andnot(i, v); } /** * atomic_fetch_andnot_acquire() - atomic bitwise AND NOT with acquire ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_andnot_acquire() there. * * Return: The original value of @v. 
*/ static __always_inline int atomic_fetch_andnot_acquire(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_andnot_acquire(i, v); } /** * atomic_fetch_andnot_release() - atomic bitwise AND NOT with release ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_andnot_release() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_andnot_release(int i, atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_andnot_release(i, v); } /** * atomic_fetch_andnot_relaxed() - atomic bitwise AND NOT with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_andnot_relaxed() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_andnot_relaxed(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_andnot_relaxed(i, v); } /** * atomic_or() - atomic bitwise OR with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_or() there. * * Return: Nothing. */ static __always_inline void atomic_or(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_or(i, v); } /** * atomic_fetch_or() - atomic bitwise OR with full ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_or() there. * * Return: The original value of @v. 
*/ static __always_inline int atomic_fetch_or(int i, atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_or(i, v); } /** * atomic_fetch_or_acquire() - atomic bitwise OR with acquire ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_or_acquire() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_or_acquire(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_or_acquire(i, v); } /** * atomic_fetch_or_release() - atomic bitwise OR with release ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_or_release() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_or_release(int i, atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_or_release(i, v); } /** * atomic_fetch_or_relaxed() - atomic bitwise OR with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_or_relaxed() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_or_relaxed(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_or_relaxed(i, v); } /** * atomic_xor() - atomic bitwise XOR with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_xor() there. * * Return: Nothing. 
*/
static __always_inline void
atomic_xor(int i, atomic_t *v)
{
	instrument_atomic_read_write(v, sizeof(*v));
	raw_atomic_xor(i, v);
}

/**
 * atomic_fetch_xor() - atomic bitwise XOR with full ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v ^ @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_fetch_xor() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int
atomic_fetch_xor(int i, atomic_t *v)
{
	/* Full ordering: kcsan_mb() precedes the instrumented access. */
	kcsan_mb();
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic_fetch_xor(i, v);
}

/**
 * atomic_fetch_xor_acquire() - atomic bitwise XOR with acquire ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v ^ @i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_fetch_xor_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int
atomic_fetch_xor_acquire(int i, atomic_t *v)
{
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic_fetch_xor_acquire(i, v);
}

/**
 * atomic_fetch_xor_release() - atomic bitwise XOR with release ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v ^ @i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_fetch_xor_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int
atomic_fetch_xor_release(int i, atomic_t *v)
{
	/* Release ordering: kcsan_release() precedes the instrumented access. */
	kcsan_release();
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic_fetch_xor_release(i, v);
}

/**
 * atomic_fetch_xor_relaxed() - atomic bitwise XOR with relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v ^ @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_fetch_xor_relaxed() there.
 *
 * Return: The original value of @v.
*/
static __always_inline int
atomic_fetch_xor_relaxed(int i, atomic_t *v)
{
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic_fetch_xor_relaxed(i, v);
}

/**
 * atomic_xchg() - atomic exchange with full ordering
 * @v: pointer to atomic_t
 * @new: int value to assign
 *
 * Atomically updates @v to @new with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_xchg() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int
atomic_xchg(atomic_t *v, int new)
{
	/* Full ordering: kcsan_mb() precedes the instrumented access. */
	kcsan_mb();
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic_xchg(v, new);
}

/**
 * atomic_xchg_acquire() - atomic exchange with acquire ordering
 * @v: pointer to atomic_t
 * @new: int value to assign
 *
 * Atomically updates @v to @new with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_xchg_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int
atomic_xchg_acquire(atomic_t *v, int new)
{
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic_xchg_acquire(v, new);
}

/**
 * atomic_xchg_release() - atomic exchange with release ordering
 * @v: pointer to atomic_t
 * @new: int value to assign
 *
 * Atomically updates @v to @new with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_xchg_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int
atomic_xchg_release(atomic_t *v, int new)
{
	/* Release ordering: kcsan_release() precedes the instrumented access. */
	kcsan_release();
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic_xchg_release(v, new);
}

/**
 * atomic_xchg_relaxed() - atomic exchange with relaxed ordering
 * @v: pointer to atomic_t
 * @new: int value to assign
 *
 * Atomically updates @v to @new with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_xchg_relaxed() there.
 *
 * Return: The original value of @v.
*/
static __always_inline int
atomic_xchg_relaxed(atomic_t *v, int new)
{
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic_xchg_relaxed(v, new);
}

/**
 * atomic_cmpxchg() - atomic compare and exchange with full ordering
 * @v: pointer to atomic_t
 * @old: int value to compare with
 * @new: int value to assign
 *
 * If (@v == @old), atomically updates @v to @new with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_cmpxchg() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int
atomic_cmpxchg(atomic_t *v, int old, int new)
{
	/* kcsan_mb() is issued unconditionally, even if @v ends up unmodified. */
	kcsan_mb();
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic_cmpxchg(v, old, new);
}

/**
 * atomic_cmpxchg_acquire() - atomic compare and exchange with acquire ordering
 * @v: pointer to atomic_t
 * @old: int value to compare with
 * @new: int value to assign
 *
 * If (@v == @old), atomically updates @v to @new with acquire ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_cmpxchg_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int
atomic_cmpxchg_acquire(atomic_t *v, int old, int new)
{
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic_cmpxchg_acquire(v, old, new);
}

/**
 * atomic_cmpxchg_release() - atomic compare and exchange with release ordering
 * @v: pointer to atomic_t
 * @old: int value to compare with
 * @new: int value to assign
 *
 * If (@v == @old), atomically updates @v to @new with release ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_cmpxchg_release() there.
 *
 * Return: The original value of @v.
*/
static __always_inline int
atomic_cmpxchg_release(atomic_t *v, int old, int new)
{
	/* kcsan_release() is issued unconditionally, even if @v ends up unmodified. */
	kcsan_release();
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic_cmpxchg_release(v, old, new);
}

/**
 * atomic_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering
 * @v: pointer to atomic_t
 * @old: int value to compare with
 * @new: int value to assign
 *
 * If (@v == @old), atomically updates @v to @new with relaxed ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_cmpxchg_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int
atomic_cmpxchg_relaxed(atomic_t *v, int old, int new)
{
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic_cmpxchg_relaxed(v, old, new);
}

/**
 * atomic_try_cmpxchg() - atomic compare and exchange with full ordering
 * @v: pointer to atomic_t
 * @old: pointer to int value to compare with
 * @new: int value to assign
 *
 * If (@v == @old), atomically updates @v to @new with full ordering.
 * Otherwise, @v is not modified, @old is updated to the current value of @v,
 * and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_try_cmpxchg() there.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic_try_cmpxchg(atomic_t *v, int *old, int new)
{
	/* kcsan_mb() is issued unconditionally, even if @v ends up unmodified. */
	kcsan_mb();
	instrument_atomic_read_write(v, sizeof(*v));
	/* @old is written back on failure, hence instrumented as read-write. */
	instrument_atomic_read_write(old, sizeof(*old));
	return raw_atomic_try_cmpxchg(v, old, new);
}

/**
 * atomic_try_cmpxchg_acquire() - atomic compare and exchange with acquire ordering
 * @v: pointer to atomic_t
 * @old: pointer to int value to compare with
 * @new: int value to assign
 *
 * If (@v == @old), atomically updates @v to @new with acquire ordering.
 * Otherwise, @v is not modified, @old is updated to the current value of @v,
 * and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_try_cmpxchg_acquire() there.
*
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new)
{
	instrument_atomic_read_write(v, sizeof(*v));
	/* @old is written back on failure, hence instrumented as read-write. */
	instrument_atomic_read_write(old, sizeof(*old));
	return raw_atomic_try_cmpxchg_acquire(v, old, new);
}

/**
 * atomic_try_cmpxchg_release() - atomic compare and exchange with release ordering
 * @v: pointer to atomic_t
 * @old: pointer to int value to compare with
 * @new: int value to assign
 *
 * If (@v == @old), atomically updates @v to @new with release ordering.
 * Otherwise, @v is not modified, @old is updated to the current value of @v,
 * and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_try_cmpxchg_release() there.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic_try_cmpxchg_release(atomic_t *v, int *old, int new)
{
	/* kcsan_release() is issued unconditionally, even if @v ends up unmodified. */
	kcsan_release();
	instrument_atomic_read_write(v, sizeof(*v));
	/* @old is written back on failure, hence instrumented as read-write. */
	instrument_atomic_read_write(old, sizeof(*old));
	return raw_atomic_try_cmpxchg_release(v, old, new);
}

/**
 * atomic_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering
 * @v: pointer to atomic_t
 * @old: pointer to int value to compare with
 * @new: int value to assign
 *
 * If (@v == @old), atomically updates @v to @new with relaxed ordering.
 * Otherwise, @v is not modified, @old is updated to the current value of @v,
 * and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_try_cmpxchg_relaxed() there.
 *
 * Return: @true if the exchange occurred, @false otherwise.
*/
static __always_inline bool
atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new)
{
	instrument_atomic_read_write(v, sizeof(*v));
	/* @old is written back on failure, hence instrumented as read-write. */
	instrument_atomic_read_write(old, sizeof(*old));
	return raw_atomic_try_cmpxchg_relaxed(v, old, new);
}

/**
 * atomic_sub_and_test() - atomic subtract and test if zero with full ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v - @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_sub_and_test() there.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
atomic_sub_and_test(int i, atomic_t *v)
{
	/* Full ordering: kcsan_mb() precedes the instrumented access. */
	kcsan_mb();
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic_sub_and_test(i, v);
}

/**
 * atomic_dec_and_test() - atomic decrement and test if zero with full ordering
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v - 1) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_dec_and_test() there.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
atomic_dec_and_test(atomic_t *v)
{
	/* Full ordering: kcsan_mb() precedes the instrumented access. */
	kcsan_mb();
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic_dec_and_test(v);
}

/**
 * atomic_inc_and_test() - atomic increment and test if zero with full ordering
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + 1) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_inc_and_test() there.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
atomic_inc_and_test(atomic_t *v)
{
	/* Full ordering: kcsan_mb() precedes the instrumented access. */
	kcsan_mb();
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic_inc_and_test(v);
}

/**
 * atomic_add_negative() - atomic add and test if negative with full ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_add_negative() there.
*
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
atomic_add_negative(int i, atomic_t *v)
{
	/* Full ordering: kcsan_mb() precedes the instrumented access. */
	kcsan_mb();
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic_add_negative(i, v);
}

/**
 * atomic_add_negative_acquire() - atomic add and test if negative with acquire ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + @i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_add_negative_acquire() there.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
atomic_add_negative_acquire(int i, atomic_t *v)
{
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic_add_negative_acquire(i, v);
}

/**
 * atomic_add_negative_release() - atomic add and test if negative with release ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + @i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_add_negative_release() there.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
atomic_add_negative_release(int i, atomic_t *v)
{
	/* Release ordering: kcsan_release() precedes the instrumented access. */
	kcsan_release();
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic_add_negative_release(i, v);
}

/**
 * atomic_add_negative_relaxed() - atomic add and test if negative with relaxed ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_add_negative_relaxed() there.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
*/
static __always_inline bool
atomic_add_negative_relaxed(int i, atomic_t *v)
{
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic_add_negative_relaxed(i, v);
}

/**
 * atomic_fetch_add_unless() - atomic add unless value with full ordering
 * @v: pointer to atomic_t
 * @a: int value to add
 * @u: int value to compare with
 *
 * If (@v != @u), atomically updates @v to (@v + @a) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_fetch_add_unless() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int
atomic_fetch_add_unless(atomic_t *v, int a, int u)
{
	/* kcsan_mb() is issued unconditionally, even if @v ends up unmodified. */
	kcsan_mb();
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic_fetch_add_unless(v, a, u);
}

/**
 * atomic_add_unless() - atomic add unless value with full ordering
 * @v: pointer to atomic_t
 * @a: int value to add
 * @u: int value to compare with
 *
 * If (@v != @u), atomically updates @v to (@v + @a) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_add_unless() there.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
atomic_add_unless(atomic_t *v, int a, int u)
{
	/* kcsan_mb() is issued unconditionally, even if @v ends up unmodified. */
	kcsan_mb();
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic_add_unless(v, a, u);
}

/**
 * atomic_inc_not_zero() - atomic increment unless zero with full ordering
 * @v: pointer to atomic_t
 *
 * If (@v != 0), atomically updates @v to (@v + 1) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_inc_not_zero() there.
 *
 * Return: @true if @v was updated, @false otherwise.
*/
static __always_inline bool
atomic_inc_not_zero(atomic_t *v)
{
	/* kcsan_mb() is issued unconditionally, even if @v ends up unmodified. */
	kcsan_mb();
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic_inc_not_zero(v);
}

/**
 * atomic_inc_unless_negative() - atomic increment unless negative with full ordering
 * @v: pointer to atomic_t
 *
 * If (@v >= 0), atomically updates @v to (@v + 1) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_inc_unless_negative() there.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
atomic_inc_unless_negative(atomic_t *v)
{
	/* kcsan_mb() is issued unconditionally, even if @v ends up unmodified. */
	kcsan_mb();
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic_inc_unless_negative(v);
}

/**
 * atomic_dec_unless_positive() - atomic decrement unless positive with full ordering
 * @v: pointer to atomic_t
 *
 * If (@v <= 0), atomically updates @v to (@v - 1) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_dec_unless_positive() there.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
atomic_dec_unless_positive(atomic_t *v)
{
	/* kcsan_mb() is issued unconditionally, even if @v ends up unmodified. */
	kcsan_mb();
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic_dec_unless_positive(v);
}

/**
 * atomic_dec_if_positive() - atomic decrement if positive with full ordering
 * @v: pointer to atomic_t
 *
 * If (@v > 0), atomically updates @v to (@v - 1) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_dec_if_positive() there.
 *
 * Return: The old value of (@v - 1), regardless of whether @v was updated.
 */
static __always_inline int
atomic_dec_if_positive(atomic_t *v)
{
	/* kcsan_mb() is issued unconditionally, even if @v ends up unmodified. */
	kcsan_mb();
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic_dec_if_positive(v);
}

/**
 * atomic64_read() - atomic load with relaxed ordering
 * @v: pointer to atomic64_t
 *
 * Atomically loads the value of @v with relaxed ordering.
*
 * Unsafe to use in noinstr code; use raw_atomic64_read() there.
 *
 * Return: The value loaded from @v.
 */
static __always_inline s64
atomic64_read(const atomic64_t *v)
{
	/* Pure load: instrumented as a read only. */
	instrument_atomic_read(v, sizeof(*v));
	return raw_atomic64_read(v);
}

/**
 * atomic64_read_acquire() - atomic load with acquire ordering
 * @v: pointer to atomic64_t
 *
 * Atomically loads the value of @v with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_read_acquire() there.
 *
 * Return: The value loaded from @v.
 */
static __always_inline s64
atomic64_read_acquire(const atomic64_t *v)
{
	/* Pure load: instrumented as a read only. */
	instrument_atomic_read(v, sizeof(*v));
	return raw_atomic64_read_acquire(v);
}

/**
 * atomic64_set() - atomic set with relaxed ordering
 * @v: pointer to atomic64_t
 * @i: s64 value to assign
 *
 * Atomically sets @v to @i with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_set() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic64_set(atomic64_t *v, s64 i)
{
	/* Pure store: instrumented as a write only. */
	instrument_atomic_write(v, sizeof(*v));
	raw_atomic64_set(v, i);
}

/**
 * atomic64_set_release() - atomic set with release ordering
 * @v: pointer to atomic64_t
 * @i: s64 value to assign
 *
 * Atomically sets @v to @i with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_set_release() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic64_set_release(atomic64_t *v, s64 i)
{
	/* Release ordering: kcsan_release() precedes the instrumented access. */
	kcsan_release();
	/* Pure store: instrumented as a write only. */
	instrument_atomic_write(v, sizeof(*v));
	raw_atomic64_set_release(v, i);
}

/**
 * atomic64_add() - atomic add with relaxed ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_add() there.
 *
 * Return: Nothing.
*/
static __always_inline void
atomic64_add(s64 i, atomic64_t *v)
{
	instrument_atomic_read_write(v, sizeof(*v));
	raw_atomic64_add(i, v);
}

/**
 * atomic64_add_return() - atomic add with full ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_add_return() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
atomic64_add_return(s64 i, atomic64_t *v)
{
	/* Full ordering: kcsan_mb() precedes the instrumented access. */
	kcsan_mb();
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic64_add_return(i, v);
}

/**
 * atomic64_add_return_acquire() - atomic add with acquire ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_add_return_acquire() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
atomic64_add_return_acquire(s64 i, atomic64_t *v)
{
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic64_add_return_acquire(i, v);
}

/**
 * atomic64_add_return_release() - atomic add with release ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_add_return_release() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
atomic64_add_return_release(s64 i, atomic64_t *v)
{
	/* Release ordering: kcsan_release() precedes the instrumented access. */
	kcsan_release();
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic64_add_return_release(i, v);
}

/**
 * atomic64_add_return_relaxed() - atomic add with relaxed ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_add_return_relaxed() there.
 *
 * Return: The updated value of @v.
*/
static __always_inline s64
atomic64_add_return_relaxed(s64 i, atomic64_t *v)
{
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic64_add_return_relaxed(i, v);
}

/**
 * atomic64_fetch_add() - atomic add with full ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_add() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_fetch_add(s64 i, atomic64_t *v)
{
	/* Full ordering: kcsan_mb() precedes the instrumented access. */
	kcsan_mb();
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic64_fetch_add(i, v);
}

/**
 * atomic64_fetch_add_acquire() - atomic add with acquire ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_add_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_fetch_add_acquire(s64 i, atomic64_t *v)
{
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic64_fetch_add_acquire(i, v);
}

/**
 * atomic64_fetch_add_release() - atomic add with release ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_add_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_fetch_add_release(s64 i, atomic64_t *v)
{
	/* Release ordering: kcsan_release() precedes the instrumented access. */
	kcsan_release();
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic64_fetch_add_release(i, v);
}

/**
 * atomic64_fetch_add_relaxed() - atomic add with relaxed ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_add_relaxed() there.
 *
 * Return: The original value of @v.
*/
static __always_inline s64
atomic64_fetch_add_relaxed(s64 i, atomic64_t *v)
{
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic64_fetch_add_relaxed(i, v);
}

/**
 * atomic64_sub() - atomic subtract with relaxed ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_sub() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic64_sub(s64 i, atomic64_t *v)
{
	instrument_atomic_read_write(v, sizeof(*v));
	raw_atomic64_sub(i, v);
}

/**
 * atomic64_sub_return() - atomic subtract with full ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_sub_return() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
atomic64_sub_return(s64 i, atomic64_t *v)
{
	/* Full ordering: kcsan_mb() precedes the instrumented access. */
	kcsan_mb();
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic64_sub_return(i, v);
}

/**
 * atomic64_sub_return_acquire() - atomic subtract with acquire ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_sub_return_acquire() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
atomic64_sub_return_acquire(s64 i, atomic64_t *v)
{
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic64_sub_return_acquire(i, v);
}

/**
 * atomic64_sub_return_release() - atomic subtract with release ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_sub_return_release() there.
 *
 * Return: The updated value of @v.
*/
static __always_inline s64
atomic64_sub_return_release(s64 i, atomic64_t *v)
{
	/* Release ordering: kcsan_release() precedes the instrumented access. */
	kcsan_release();
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic64_sub_return_release(i, v);
}

/**
 * atomic64_sub_return_relaxed() - atomic subtract with relaxed ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_sub_return_relaxed() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
atomic64_sub_return_relaxed(s64 i, atomic64_t *v)
{
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic64_sub_return_relaxed(i, v);
}

/**
 * atomic64_fetch_sub() - atomic subtract with full ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_sub() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_fetch_sub(s64 i, atomic64_t *v)
{
	/* Full ordering: kcsan_mb() precedes the instrumented access. */
	kcsan_mb();
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic64_fetch_sub(i, v);
}

/**
 * atomic64_fetch_sub_acquire() - atomic subtract with acquire ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_sub_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_fetch_sub_acquire(s64 i, atomic64_t *v)
{
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic64_fetch_sub_acquire(i, v);
}

/**
 * atomic64_fetch_sub_release() - atomic subtract with release ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_sub_release() there.
 *
 * Return: The original value of @v.
*/
static __always_inline s64
atomic64_fetch_sub_release(s64 i, atomic64_t *v)
{
	/* Release ordering: kcsan_release() precedes the instrumented access. */
	kcsan_release();
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic64_fetch_sub_release(i, v);
}

/**
 * atomic64_fetch_sub_relaxed() - atomic subtract with relaxed ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_sub_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_fetch_sub_relaxed(s64 i, atomic64_t *v)
{
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic64_fetch_sub_relaxed(i, v);
}

/**
 * atomic64_inc() - atomic increment with relaxed ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + 1) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_inc() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic64_inc(atomic64_t *v)
{
	instrument_atomic_read_write(v, sizeof(*v));
	raw_atomic64_inc(v);
}

/**
 * atomic64_inc_return() - atomic increment with full ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + 1) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_inc_return() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
atomic64_inc_return(atomic64_t *v)
{
	/* Full ordering: kcsan_mb() precedes the instrumented access. */
	kcsan_mb();
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic64_inc_return(v);
}

/**
 * atomic64_inc_return_acquire() - atomic increment with acquire ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + 1) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_inc_return_acquire() there.
 *
 * Return: The updated value of @v.
*/
static __always_inline s64
atomic64_inc_return_acquire(atomic64_t *v)
{
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic64_inc_return_acquire(v);
}

/**
 * atomic64_inc_return_release() - atomic increment with release ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + 1) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_inc_return_release() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
atomic64_inc_return_release(atomic64_t *v)
{
	/* Release ordering: kcsan_release() precedes the instrumented access. */
	kcsan_release();
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic64_inc_return_release(v);
}

/**
 * atomic64_inc_return_relaxed() - atomic increment with relaxed ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + 1) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_inc_return_relaxed() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
atomic64_inc_return_relaxed(atomic64_t *v)
{
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic64_inc_return_relaxed(v);
}

/**
 * atomic64_fetch_inc() - atomic increment with full ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + 1) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_inc() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_fetch_inc(atomic64_t *v)
{
	/* Full ordering: kcsan_mb() precedes the instrumented access. */
	kcsan_mb();
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic64_fetch_inc(v);
}

/**
 * atomic64_fetch_inc_acquire() - atomic increment with acquire ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + 1) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_inc_acquire() there.
 *
 * Return: The original value of @v.
*/
static __always_inline s64
atomic64_fetch_inc_acquire(atomic64_t *v)
{
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic64_fetch_inc_acquire(v);
}

/**
 * atomic64_fetch_inc_release() - atomic increment with release ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + 1) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_inc_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_fetch_inc_release(atomic64_t *v)
{
	/* Release ordering: kcsan_release() precedes the instrumented access. */
	kcsan_release();
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic64_fetch_inc_release(v);
}

/**
 * atomic64_fetch_inc_relaxed() - atomic increment with relaxed ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + 1) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_inc_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_fetch_inc_relaxed(atomic64_t *v)
{
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic64_fetch_inc_relaxed(v);
}

/**
 * atomic64_dec() - atomic decrement with relaxed ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - 1) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_dec() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic64_dec(atomic64_t *v)
{
	instrument_atomic_read_write(v, sizeof(*v));
	raw_atomic64_dec(v);
}

/**
 * atomic64_dec_return() - atomic decrement with full ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - 1) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_dec_return() there.
 *
 * Return: The updated value of @v.
*/
static __always_inline s64
atomic64_dec_return(atomic64_t *v)
{
	/* Full ordering: kcsan_mb() precedes the instrumented access. */
	kcsan_mb();
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic64_dec_return(v);
}

/**
 * atomic64_dec_return_acquire() - atomic decrement with acquire ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - 1) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_dec_return_acquire() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
atomic64_dec_return_acquire(atomic64_t *v)
{
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic64_dec_return_acquire(v);
}

/**
 * atomic64_dec_return_release() - atomic decrement with release ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - 1) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_dec_return_release() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
atomic64_dec_return_release(atomic64_t *v)
{
	/* Release ordering: kcsan_release() precedes the instrumented access. */
	kcsan_release();
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic64_dec_return_release(v);
}

/**
 * atomic64_dec_return_relaxed() - atomic decrement with relaxed ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - 1) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_dec_return_relaxed() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
atomic64_dec_return_relaxed(atomic64_t *v)
{
	instrument_atomic_read_write(v, sizeof(*v));
	return raw_atomic64_dec_return_relaxed(v);
}

/**
 * atomic64_fetch_dec() - atomic decrement with full ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - 1) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_dec() there.
 *
 * Return: The original value of @v.
*/ static __always_inline s64 atomic64_fetch_dec(atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_dec(v); } /** * atomic64_fetch_dec_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_dec_acquire() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_dec_acquire(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_dec_acquire(v); } /** * atomic64_fetch_dec_release() - atomic decrement with release ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_dec_release() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_dec_release(atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_dec_release(v); } /** * atomic64_fetch_dec_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_dec_relaxed() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_dec_relaxed(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_dec_relaxed(v); } /** * atomic64_and() - atomic bitwise AND with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_and() there. * * Return: Nothing. 
*/ static __always_inline void atomic64_and(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic64_and(i, v); } /** * atomic64_fetch_and() - atomic bitwise AND with full ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_and() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_and(s64 i, atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_and(i, v); } /** * atomic64_fetch_and_acquire() - atomic bitwise AND with acquire ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_and_acquire() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_and_acquire(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_and_acquire(i, v); } /** * atomic64_fetch_and_release() - atomic bitwise AND with release ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_and_release() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_and_release(s64 i, atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_and_release(i, v); } /** * atomic64_fetch_and_relaxed() - atomic bitwise AND with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_and_relaxed() there. * * Return: The original value of @v. 
*/ static __always_inline s64 atomic64_fetch_and_relaxed(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_and_relaxed(i, v); } /** * atomic64_andnot() - atomic bitwise AND NOT with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_andnot() there. * * Return: Nothing. */ static __always_inline void atomic64_andnot(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic64_andnot(i, v); } /** * atomic64_fetch_andnot() - atomic bitwise AND NOT with full ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_andnot() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_andnot(s64 i, atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_andnot(i, v); } /** * atomic64_fetch_andnot_acquire() - atomic bitwise AND NOT with acquire ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_andnot_acquire() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_andnot_acquire(i, v); } /** * atomic64_fetch_andnot_release() - atomic bitwise AND NOT with release ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_andnot_release() there. * * Return: The original value of @v. 
*/ static __always_inline s64 atomic64_fetch_andnot_release(s64 i, atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_andnot_release(i, v); } /** * atomic64_fetch_andnot_relaxed() - atomic bitwise AND NOT with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_andnot_relaxed() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_andnot_relaxed(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_andnot_relaxed(i, v); } /** * atomic64_or() - atomic bitwise OR with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_or() there. * * Return: Nothing. */ static __always_inline void atomic64_or(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic64_or(i, v); } /** * atomic64_fetch_or() - atomic bitwise OR with full ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_or() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_or(s64 i, atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_or(i, v); } /** * atomic64_fetch_or_acquire() - atomic bitwise OR with acquire ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_or_acquire() there. * * Return: The original value of @v. 
*/ static __always_inline s64 atomic64_fetch_or_acquire(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_or_acquire(i, v); } /** * atomic64_fetch_or_release() - atomic bitwise OR with release ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_or_release() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_or_release(s64 i, atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_or_release(i, v); } /** * atomic64_fetch_or_relaxed() - atomic bitwise OR with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_or_relaxed() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_or_relaxed(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_or_relaxed(i, v); } /** * atomic64_xor() - atomic bitwise XOR with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_xor() there. * * Return: Nothing. */ static __always_inline void atomic64_xor(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic64_xor(i, v); } /** * atomic64_fetch_xor() - atomic bitwise XOR with full ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_xor() there. * * Return: The original value of @v. 
*/ static __always_inline s64 atomic64_fetch_xor(s64 i, atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_xor(i, v); } /** * atomic64_fetch_xor_acquire() - atomic bitwise XOR with acquire ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_xor_acquire() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_xor_acquire(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_xor_acquire(i, v); } /** * atomic64_fetch_xor_release() - atomic bitwise XOR with release ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_xor_release() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_xor_release(s64 i, atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_xor_release(i, v); } /** * atomic64_fetch_xor_relaxed() - atomic bitwise XOR with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_xor_relaxed() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_xor_relaxed(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_xor_relaxed(i, v); } /** * atomic64_xchg() - atomic exchange with full ordering * @v: pointer to atomic64_t * @new: s64 value to assign * * Atomically updates @v to @new with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_xchg() there. * * Return: The original value of @v. 
*/ static __always_inline s64 atomic64_xchg(atomic64_t *v, s64 new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_xchg(v, new); } /** * atomic64_xchg_acquire() - atomic exchange with acquire ordering * @v: pointer to atomic64_t * @new: s64 value to assign * * Atomically updates @v to @new with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_xchg_acquire() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_xchg_acquire(atomic64_t *v, s64 new) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_xchg_acquire(v, new); } /** * atomic64_xchg_release() - atomic exchange with release ordering * @v: pointer to atomic64_t * @new: s64 value to assign * * Atomically updates @v to @new with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_xchg_release() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_xchg_release(atomic64_t *v, s64 new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_xchg_release(v, new); } /** * atomic64_xchg_relaxed() - atomic exchange with relaxed ordering * @v: pointer to atomic64_t * @new: s64 value to assign * * Atomically updates @v to @new with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_xchg_relaxed() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_xchg_relaxed(atomic64_t *v, s64 new) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_xchg_relaxed(v, new); } /** * atomic64_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic64_t * @old: s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_cmpxchg() there. * * Return: The original value of @v. 
*/ static __always_inline s64 atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_cmpxchg(v, old, new); } /** * atomic64_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic64_t * @old: s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_cmpxchg_acquire() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_cmpxchg_acquire(v, old, new); } /** * atomic64_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic64_t * @old: s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_cmpxchg_release() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_cmpxchg_release(atomic64_t *v, s64 old, s64 new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_cmpxchg_release(v, old, new); } /** * atomic64_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic64_t * @old: s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_cmpxchg_relaxed() there. * * Return: The original value of @v. 
*/ static __always_inline s64 atomic64_cmpxchg_relaxed(atomic64_t *v, s64 old, s64 new) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_cmpxchg_relaxed(v, old, new); } /** * atomic64_try_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic64_t * @old: pointer to s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_try_cmpxchg() there. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return raw_atomic64_try_cmpxchg(v, old, new); } /** * atomic64_try_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic64_t * @old: pointer to s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_try_cmpxchg_acquire() there. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new) { instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return raw_atomic64_try_cmpxchg_acquire(v, old, new); } /** * atomic64_try_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic64_t * @old: pointer to s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. 
* Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_try_cmpxchg_release() there. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return raw_atomic64_try_cmpxchg_release(v, old, new); } /** * atomic64_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic64_t * @old: pointer to s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_try_cmpxchg_relaxed() there. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new) { instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return raw_atomic64_try_cmpxchg_relaxed(v, old, new); } /** * atomic64_sub_and_test() - atomic subtract and test if zero with full ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_sub_and_test() there. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool atomic64_sub_and_test(s64 i, atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_sub_and_test(i, v); } /** * atomic64_dec_and_test() - atomic decrement and test if zero with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with full ordering. 
* * Unsafe to use in noinstr code; use raw_atomic64_dec_and_test() there. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool atomic64_dec_and_test(atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_dec_and_test(v); } /** * atomic64_inc_and_test() - atomic increment and test if zero with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_inc_and_test() there. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool atomic64_inc_and_test(atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_inc_and_test(v); } /** * atomic64_add_negative() - atomic add and test if negative with full ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_add_negative() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool atomic64_add_negative(s64 i, atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_add_negative(i, v); } /** * atomic64_add_negative_acquire() - atomic add and test if negative with acquire ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_add_negative_acquire() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. 
*/ static __always_inline bool atomic64_add_negative_acquire(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_add_negative_acquire(i, v); } /** * atomic64_add_negative_release() - atomic add and test if negative with release ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_add_negative_release() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool atomic64_add_negative_release(s64 i, atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_add_negative_release(i, v); } /** * atomic64_add_negative_relaxed() - atomic add and test if negative with relaxed ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_add_negative_relaxed() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool atomic64_add_negative_relaxed(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_add_negative_relaxed(i, v); } /** * atomic64_fetch_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic64_t * @a: s64 value to add * @u: s64 value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_add_unless() there. * * Return: The original value of @v. 
*/ static __always_inline s64 atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_add_unless(v, a, u); } /** * atomic64_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic64_t * @a: s64 value to add * @u: s64 value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_add_unless() there. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool atomic64_add_unless(atomic64_t *v, s64 a, s64 u) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_add_unless(v, a, u); } /** * atomic64_inc_not_zero() - atomic increment unless zero with full ordering * @v: pointer to atomic64_t * * If (@v != 0), atomically updates @v to (@v + 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_inc_not_zero() there. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool atomic64_inc_not_zero(atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_inc_not_zero(v); } /** * atomic64_inc_unless_negative() - atomic increment unless negative with full ordering * @v: pointer to atomic64_t * * If (@v >= 0), atomically updates @v to (@v + 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_inc_unless_negative() there. * * Return: @true if @v was updated, @false otherwise. 
*/ static __always_inline bool atomic64_inc_unless_negative(atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_inc_unless_negative(v); } /** * atomic64_dec_unless_positive() - atomic decrement unless positive with full ordering * @v: pointer to atomic64_t * * If (@v <= 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_dec_unless_positive() there. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool atomic64_dec_unless_positive(atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_dec_unless_positive(v); } /** * atomic64_dec_if_positive() - atomic decrement if positive with full ordering * @v: pointer to atomic64_t * * If (@v > 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_dec_if_positive() there. * * Return: The old value of (@v - 1), regardless of whether @v was updated. */ static __always_inline s64 atomic64_dec_if_positive(atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_dec_if_positive(v); } /** * atomic_long_read() - atomic load with relaxed ordering * @v: pointer to atomic_long_t * * Atomically loads the value of @v with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_read() there. * * Return: The value loaded from @v. */ static __always_inline long atomic_long_read(const atomic_long_t *v) { instrument_atomic_read(v, sizeof(*v)); return raw_atomic_long_read(v); } /** * atomic_long_read_acquire() - atomic load with acquire ordering * @v: pointer to atomic_long_t * * Atomically loads the value of @v with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_read_acquire() there. 
* * Return: The value loaded from @v. */ static __always_inline long atomic_long_read_acquire(const atomic_long_t *v) { instrument_atomic_read(v, sizeof(*v)); return raw_atomic_long_read_acquire(v); } /** * atomic_long_set() - atomic set with relaxed ordering * @v: pointer to atomic_long_t * @i: long value to assign * * Atomically sets @v to @i with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_set() there. * * Return: Nothing. */ static __always_inline void atomic_long_set(atomic_long_t *v, long i) { instrument_atomic_write(v, sizeof(*v)); raw_atomic_long_set(v, i); } /** * atomic_long_set_release() - atomic set with release ordering * @v: pointer to atomic_long_t * @i: long value to assign * * Atomically sets @v to @i with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_set_release() there. * * Return: Nothing. */ static __always_inline void atomic_long_set_release(atomic_long_t *v, long i) { kcsan_release(); instrument_atomic_write(v, sizeof(*v)); raw_atomic_long_set_release(v, i); } /** * atomic_long_add() - atomic add with relaxed ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_add() there. * * Return: Nothing. */ static __always_inline void atomic_long_add(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_long_add(i, v); } /** * atomic_long_add_return() - atomic add with full ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_add_return() there. * * Return: The updated value of @v. 
*/ static __always_inline long atomic_long_add_return(long i, atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_add_return(i, v); } /** * atomic_long_add_return_acquire() - atomic add with acquire ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_add_return_acquire() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_add_return_acquire(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_add_return_acquire(i, v); } /** * atomic_long_add_return_release() - atomic add with release ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_add_return_release() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_add_return_release(long i, atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_add_return_release(i, v); } /** * atomic_long_add_return_relaxed() - atomic add with relaxed ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_add_return_relaxed() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_add_return_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_add_return_relaxed(i, v); } /** * atomic_long_fetch_add() - atomic add with full ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_add() there. * * Return: The original value of @v. 
*/ static __always_inline long atomic_long_fetch_add(long i, atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_add(i, v); } /** * atomic_long_fetch_add_acquire() - atomic add with acquire ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_add_acquire() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_add_acquire(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_add_acquire(i, v); } /** * atomic_long_fetch_add_release() - atomic add with release ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_add_release() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_add_release(long i, atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_add_release(i, v); } /** * atomic_long_fetch_add_relaxed() - atomic add with relaxed ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_add_relaxed() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_add_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_add_relaxed(i, v); } /** * atomic_long_sub() - atomic subtract with relaxed ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_sub() there. * * Return: Nothing. 
*/ static __always_inline void atomic_long_sub(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_long_sub(i, v); } /** * atomic_long_sub_return() - atomic subtract with full ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_sub_return() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_sub_return(long i, atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_sub_return(i, v); } /** * atomic_long_sub_return_acquire() - atomic subtract with acquire ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_sub_return_acquire() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_sub_return_acquire(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_sub_return_acquire(i, v); } /** * atomic_long_sub_return_release() - atomic subtract with release ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_sub_return_release() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_sub_return_release(long i, atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_sub_return_release(i, v); } /** * atomic_long_sub_return_relaxed() - atomic subtract with relaxed ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_sub_return_relaxed() there. * * Return: The updated value of @v. 
*/ static __always_inline long atomic_long_sub_return_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_sub_return_relaxed(i, v); } /** * atomic_long_fetch_sub() - atomic subtract with full ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_sub() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_sub(long i, atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_sub(i, v); } /** * atomic_long_fetch_sub_acquire() - atomic subtract with acquire ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_sub_acquire() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_sub_acquire(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_sub_acquire(i, v); } /** * atomic_long_fetch_sub_release() - atomic subtract with release ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_sub_release() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_sub_release(long i, atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_sub_release(i, v); } /** * atomic_long_fetch_sub_relaxed() - atomic subtract with relaxed ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_sub_relaxed() there. 
* * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_sub_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_sub_relaxed(i, v); } /** * atomic_long_inc() - atomic increment with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_inc() there. * * Return: Nothing. */ static __always_inline void atomic_long_inc(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_long_inc(v); } /** * atomic_long_inc_return() - atomic increment with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_inc_return() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_inc_return(atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_inc_return(v); } /** * atomic_long_inc_return_acquire() - atomic increment with acquire ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_inc_return_acquire() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_inc_return_acquire(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_inc_return_acquire(v); } /** * atomic_long_inc_return_release() - atomic increment with release ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_inc_return_release() there. * * Return: The updated value of @v. 
*/ static __always_inline long atomic_long_inc_return_release(atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_inc_return_release(v); } /** * atomic_long_inc_return_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_inc_return_relaxed() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_inc_return_relaxed(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_inc_return_relaxed(v); } /** * atomic_long_fetch_inc() - atomic increment with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_inc() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_inc(atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_inc(v); } /** * atomic_long_fetch_inc_acquire() - atomic increment with acquire ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_inc_acquire() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_inc_acquire(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_inc_acquire(v); } /** * atomic_long_fetch_inc_release() - atomic increment with release ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_inc_release() there. * * Return: The original value of @v. 
*/ static __always_inline long atomic_long_fetch_inc_release(atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_inc_release(v); } /** * atomic_long_fetch_inc_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_inc_relaxed() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_inc_relaxed(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_inc_relaxed(v); } /** * atomic_long_dec() - atomic decrement with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_dec() there. * * Return: Nothing. */ static __always_inline void atomic_long_dec(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_long_dec(v); } /** * atomic_long_dec_return() - atomic decrement with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_dec_return() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_dec_return(atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_dec_return(v); } /** * atomic_long_dec_return_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_dec_return_acquire() there. * * Return: The updated value of @v. 
*/ static __always_inline long atomic_long_dec_return_acquire(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_dec_return_acquire(v); } /** * atomic_long_dec_return_release() - atomic decrement with release ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_dec_return_release() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_dec_return_release(atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_dec_return_release(v); } /** * atomic_long_dec_return_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_dec_return_relaxed() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_dec_return_relaxed(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_dec_return_relaxed(v); } /** * atomic_long_fetch_dec() - atomic decrement with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_dec() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_dec(atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_dec(v); } /** * atomic_long_fetch_dec_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_dec_acquire() there. * * Return: The original value of @v. 
*/ static __always_inline long atomic_long_fetch_dec_acquire(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_dec_acquire(v); } /** * atomic_long_fetch_dec_release() - atomic decrement with release ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_dec_release() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_dec_release(atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_dec_release(v); } /** * atomic_long_fetch_dec_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_dec_relaxed() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_dec_relaxed(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_dec_relaxed(v); } /** * atomic_long_and() - atomic bitwise AND with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_and() there. * * Return: Nothing. */ static __always_inline void atomic_long_and(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_long_and(i, v); } /** * atomic_long_fetch_and() - atomic bitwise AND with full ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_and() there. * * Return: The original value of @v. 
*/ static __always_inline long atomic_long_fetch_and(long i, atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_and(i, v); } /** * atomic_long_fetch_and_acquire() - atomic bitwise AND with acquire ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_and_acquire() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_and_acquire(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_and_acquire(i, v); } /** * atomic_long_fetch_and_release() - atomic bitwise AND with release ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_and_release() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_and_release(long i, atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_and_release(i, v); } /** * atomic_long_fetch_and_relaxed() - atomic bitwise AND with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_and_relaxed() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_and_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_and_relaxed(i, v); } /** * atomic_long_andnot() - atomic bitwise AND NOT with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_andnot() there. * * Return: Nothing. 
*/ static __always_inline void atomic_long_andnot(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_long_andnot(i, v); } /** * atomic_long_fetch_andnot() - atomic bitwise AND NOT with full ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_andnot() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_andnot(long i, atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_andnot(i, v); } /** * atomic_long_fetch_andnot_acquire() - atomic bitwise AND NOT with acquire ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_andnot_acquire() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_andnot_acquire(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_andnot_acquire(i, v); } /** * atomic_long_fetch_andnot_release() - atomic bitwise AND NOT with release ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_andnot_release() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_andnot_release(long i, atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_andnot_release(i, v); } /** * atomic_long_fetch_andnot_relaxed() - atomic bitwise AND NOT with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_andnot_relaxed() there. 
* * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_andnot_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_andnot_relaxed(i, v); } /** * atomic_long_or() - atomic bitwise OR with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_or() there. * * Return: Nothing. */ static __always_inline void atomic_long_or(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_long_or(i, v); } /** * atomic_long_fetch_or() - atomic bitwise OR with full ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_or() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_or(long i, atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_or(i, v); } /** * atomic_long_fetch_or_acquire() - atomic bitwise OR with acquire ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_or_acquire() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_or_acquire(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_or_acquire(i, v); } /** * atomic_long_fetch_or_release() - atomic bitwise OR with release ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_or_release() there. * * Return: The original value of @v. 
*/ static __always_inline long atomic_long_fetch_or_release(long i, atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_or_release(i, v); } /** * atomic_long_fetch_or_relaxed() - atomic bitwise OR with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_or_relaxed() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_or_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_or_relaxed(i, v); } /** * atomic_long_xor() - atomic bitwise XOR with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_xor() there. * * Return: Nothing. */ static __always_inline void atomic_long_xor(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_long_xor(i, v); } /** * atomic_long_fetch_xor() - atomic bitwise XOR with full ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_xor() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_xor(long i, atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_xor(i, v); } /** * atomic_long_fetch_xor_acquire() - atomic bitwise XOR with acquire ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_xor_acquire() there. * * Return: The original value of @v. 
*/ static __always_inline long atomic_long_fetch_xor_acquire(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_xor_acquire(i, v); } /** * atomic_long_fetch_xor_release() - atomic bitwise XOR with release ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_xor_release() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_xor_release(long i, atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_xor_release(i, v); } /** * atomic_long_fetch_xor_relaxed() - atomic bitwise XOR with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_xor_relaxed() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_xor_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_xor_relaxed(i, v); } /** * atomic_long_xchg() - atomic exchange with full ordering * @v: pointer to atomic_long_t * @new: long value to assign * * Atomically updates @v to @new with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_xchg() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_xchg(atomic_long_t *v, long new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_xchg(v, new); } /** * atomic_long_xchg_acquire() - atomic exchange with acquire ordering * @v: pointer to atomic_long_t * @new: long value to assign * * Atomically updates @v to @new with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_xchg_acquire() there. * * Return: The original value of @v. 
*/ static __always_inline long atomic_long_xchg_acquire(atomic_long_t *v, long new) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_xchg_acquire(v, new); } /** * atomic_long_xchg_release() - atomic exchange with release ordering * @v: pointer to atomic_long_t * @new: long value to assign * * Atomically updates @v to @new with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_xchg_release() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_xchg_release(atomic_long_t *v, long new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_xchg_release(v, new); } /** * atomic_long_xchg_relaxed() - atomic exchange with relaxed ordering * @v: pointer to atomic_long_t * @new: long value to assign * * Atomically updates @v to @new with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_xchg_relaxed() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_xchg_relaxed(atomic_long_t *v, long new) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_xchg_relaxed(v, new); } /** * atomic_long_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic_long_t * @old: long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_cmpxchg() there. * * Return: The original value of @v. 
*/ static __always_inline long atomic_long_cmpxchg(atomic_long_t *v, long old, long new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_cmpxchg(v, old, new); } /** * atomic_long_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic_long_t * @old: long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_cmpxchg_acquire() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_cmpxchg_acquire(atomic_long_t *v, long old, long new) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_cmpxchg_acquire(v, old, new); } /** * atomic_long_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic_long_t * @old: long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_cmpxchg_release() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_cmpxchg_release(atomic_long_t *v, long old, long new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_cmpxchg_release(v, old, new); } /** * atomic_long_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic_long_t * @old: long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_cmpxchg_relaxed() there. * * Return: The original value of @v. 
*/
static __always_inline long
atomic_long_cmpxchg_relaxed(atomic_long_t *v, long old, long new)
{
	long prev;

	instrument_atomic_read_write(v, sizeof(*v));
	prev = raw_atomic_long_cmpxchg_relaxed(v, old, new);
	return prev;
}

/**
 * atomic_long_try_cmpxchg() - atomic compare and exchange with full ordering
 * @v: pointer to atomic_long_t
 * @old: pointer to long value to compare with
 * @new: long value to assign
 *
 * If @v equals the value @old points to, atomically updates @v to @new with
 * full ordering. Otherwise, @v is not modified, the value @old points to is
 * updated to the current value of @v, and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_try_cmpxchg() there.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic_long_try_cmpxchg(atomic_long_t *v, long *old, long new)
{
	bool swapped;

	kcsan_mb();
	instrument_atomic_read_write(v, sizeof(*v));
	/* @old is instrumented as well as @v: it is rewritten when the compare fails. */
	instrument_atomic_read_write(old, sizeof(*old));
	swapped = raw_atomic_long_try_cmpxchg(v, old, new);
	return swapped;
}

/**
 * atomic_long_try_cmpxchg_acquire() - atomic compare and exchange with acquire ordering
 * @v: pointer to atomic_long_t
 * @old: pointer to long value to compare with
 * @new: long value to assign
 *
 * If @v equals the value @old points to, atomically updates @v to @new with
 * acquire ordering. Otherwise, @v is not modified, the value @old points to is
 * updated to the current value of @v, and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_try_cmpxchg_acquire() there.
 *
 * Return: @true if the exchange occurred, @false otherwise.
*/
static __always_inline bool
atomic_long_try_cmpxchg_acquire(atomic_long_t *v, long *old, long new)
{
	bool swapped;

	instrument_atomic_read_write(v, sizeof(*v));
	/* @old is instrumented as well as @v: it is rewritten when the compare fails. */
	instrument_atomic_read_write(old, sizeof(*old));
	swapped = raw_atomic_long_try_cmpxchg_acquire(v, old, new);
	return swapped;
}

/**
 * atomic_long_try_cmpxchg_release() - atomic compare and exchange with release ordering
 * @v: pointer to atomic_long_t
 * @old: pointer to long value to compare with
 * @new: long value to assign
 *
 * If @v equals the value @old points to, atomically updates @v to @new with
 * release ordering. Otherwise, @v is not modified, the value @old points to is
 * updated to the current value of @v, and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_try_cmpxchg_release() there.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic_long_try_cmpxchg_release(atomic_long_t *v, long *old, long new)
{
	bool swapped;

	kcsan_release();
	instrument_atomic_read_write(v, sizeof(*v));
	/* @old is instrumented as well as @v: it is rewritten when the compare fails. */
	instrument_atomic_read_write(old, sizeof(*old));
	swapped = raw_atomic_long_try_cmpxchg_release(v, old, new);
	return swapped;
}

/**
 * atomic_long_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering
 * @v: pointer to atomic_long_t
 * @old: pointer to long value to compare with
 * @new: long value to assign
 *
 * If @v equals the value @old points to, atomically updates @v to @new with
 * relaxed ordering. Otherwise, @v is not modified, the value @old points to is
 * updated to the current value of @v, and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_try_cmpxchg_relaxed() there.
 *
 * Return: @true if the exchange occurred, @false otherwise.
*/ static __always_inline bool atomic_long_try_cmpxchg_relaxed(atomic_long_t *v, long *old, long new) { instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return raw_atomic_long_try_cmpxchg_relaxed(v, old, new); } /** * atomic_long_sub_and_test() - atomic subtract and test if zero with full ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_sub_and_test() there. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool atomic_long_sub_and_test(long i, atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_sub_and_test(i, v); } /** * atomic_long_dec_and_test() - atomic decrement and test if zero with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_dec_and_test() there. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool atomic_long_dec_and_test(atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_dec_and_test(v); } /** * atomic_long_inc_and_test() - atomic increment and test if zero with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_inc_and_test() there. * * Return: @true if the resulting value of @v is zero, @false otherwise. 
*/ static __always_inline bool atomic_long_inc_and_test(atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_inc_and_test(v); } /** * atomic_long_add_negative() - atomic add and test if negative with full ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_add_negative() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool atomic_long_add_negative(long i, atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_add_negative(i, v); } /** * atomic_long_add_negative_acquire() - atomic add and test if negative with acquire ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_add_negative_acquire() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool atomic_long_add_negative_acquire(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_add_negative_acquire(i, v); } /** * atomic_long_add_negative_release() - atomic add and test if negative with release ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_add_negative_release() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. 
*/ static __always_inline bool atomic_long_add_negative_release(long i, atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_add_negative_release(i, v); } /** * atomic_long_add_negative_relaxed() - atomic add and test if negative with relaxed ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_add_negative_relaxed() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool atomic_long_add_negative_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_add_negative_relaxed(i, v); } /** * atomic_long_fetch_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic_long_t * @a: long value to add * @u: long value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_add_unless() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_add_unless(v, a, u); } /** * atomic_long_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic_long_t * @a: long value to add * @u: long value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_add_unless() there. * * Return: @true if @v was updated, @false otherwise. 
*/ static __always_inline bool atomic_long_add_unless(atomic_long_t *v, long a, long u) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_add_unless(v, a, u); } /** * atomic_long_inc_not_zero() - atomic increment unless zero with full ordering * @v: pointer to atomic_long_t * * If (@v != 0), atomically updates @v to (@v + 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_inc_not_zero() there. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool atomic_long_inc_not_zero(atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_inc_not_zero(v); } /** * atomic_long_inc_unless_negative() - atomic increment unless negative with full ordering * @v: pointer to atomic_long_t * * If (@v >= 0), atomically updates @v to (@v + 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_inc_unless_negative() there. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool atomic_long_inc_unless_negative(atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_inc_unless_negative(v); } /** * atomic_long_dec_unless_positive() - atomic decrement unless positive with full ordering * @v: pointer to atomic_long_t * * If (@v <= 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_dec_unless_positive() there. * * Return: @true if @v was updated, @false otherwise. 
*/ static __always_inline bool atomic_long_dec_unless_positive(atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_dec_unless_positive(v); } /** * atomic_long_dec_if_positive() - atomic decrement if positive with full ordering * @v: pointer to atomic_long_t * * If (@v > 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_dec_if_positive() there. * * Return: The old value of (@v - 1), regardless of whether @v was updated. */ static __always_inline long atomic_long_dec_if_positive(atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_dec_if_positive(v); } #define xchg(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ kcsan_mb(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_xchg(__ai_ptr, __VA_ARGS__); \ }) #define xchg_acquire(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_xchg_acquire(__ai_ptr, __VA_ARGS__); \ }) #define xchg_release(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ kcsan_release(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_xchg_release(__ai_ptr, __VA_ARGS__); \ }) #define xchg_relaxed(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_xchg_relaxed(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ kcsan_mb(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg_acquire(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg_acquire(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg_release(ptr, ...) 
\ ({ \ typeof(ptr) __ai_ptr = (ptr); \ kcsan_release(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg_release(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg_relaxed(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg_relaxed(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg64(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ kcsan_mb(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg64(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg64_acquire(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg64_acquire(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg64_release(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ kcsan_release(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg64_release(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg64_relaxed(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg64_relaxed(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg128(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ kcsan_mb(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg128(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg128_acquire(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg128_acquire(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg128_release(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ kcsan_release(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg128_release(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg128_relaxed(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg128_relaxed(__ai_ptr, __VA_ARGS__); \ }) #define try_cmpxchg(ptr, oldp, ...) 
\ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ kcsan_mb(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg_acquire(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg_acquire(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg_release(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ kcsan_release(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg_release(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg_relaxed(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg_relaxed(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg64(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ kcsan_mb(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg64(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg64_acquire(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg64_acquire(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg64_release(ptr, oldp, ...) 
\ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ kcsan_release(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg64_release(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg64_relaxed(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg64_relaxed(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg128(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ kcsan_mb(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg128(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg128_acquire(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg128_acquire(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg128_release(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ kcsan_release(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg128_release(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg128_relaxed(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg128_relaxed(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define cmpxchg_local(ptr, ...) 
\ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg_local(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg64_local(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg64_local(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg128_local(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg128_local(__ai_ptr, __VA_ARGS__); \ }) #define sync_cmpxchg(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ kcsan_mb(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_sync_cmpxchg(__ai_ptr, __VA_ARGS__); \ }) #define try_cmpxchg_local(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg_local(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg64_local(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg64_local(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg128_local(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg128_local(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define sync_try_cmpxchg(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ kcsan_mb(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_sync_try_cmpxchg(__ai_ptr, __VA_ARGS__); \ }) #endif /* _LINUX_ATOMIC_INSTRUMENTED_H */ // ce5b65e0f1f8a276268b667194581d24bed219d4
4 4 3 83 449 23 9 3 2 23 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __NET_FIB_RULES_H
#define __NET_FIB_RULES_H

#include <linux/types.h>
#include <linux/slab.h>
#include <linux/netdevice.h>
#include <linux/fib_rules.h>
#include <linux/refcount.h>
#include <net/flow.h>
#include <net/rtnetlink.h>
#include <net/fib_notifier.h>
#include <linux/indirect_call_wrapper.h>

/* Pair of kuid_t bounds; stored in struct fib_rule as uid_range. */
struct fib_kuid_range {
	kuid_t start;
	kuid_t end;
};

/*
 * A single policy-routing rule.
 *
 * Lifetime is reference counted through @refcnt (see fib_rule_get() and
 * fib_rule_put()); the final put frees the rule with kfree_rcu() using @rcu.
 */
struct fib_rule {
	struct list_head	list;
	int			iifindex;
	int			oifindex;
	u32			mark;
	u32			mark_mask;
	u32			flags;
	u32			table;
	u8			action;
	u8			l3mdev;	/* non-zero: fib_rule_get_table() uses arg->table */
	u8			proto;
	u8			ip_proto;
	u32			target;
	__be64			tun_id;
	struct fib_rule __rcu	*ctarget;
	struct net		*fr_net;

	refcount_t		refcnt;	/* see fib_rule_get()/fib_rule_put() */
	u32			pref;
	int			suppress_ifgroup;
	int			suppress_prefixlen;
	char			iifname[IFNAMSIZ];
	char			oifname[IFNAMSIZ];
	struct fib_kuid_range	uid_range;
	struct fib_rule_port_range	sport_range;
	struct fib_rule_port_range	dport_range;
	struct rcu_head		rcu;	/* used by kfree_rcu() in fib_rule_put() */
};

/* Per-lookup argument block handed to the fib_rules_ops callbacks. */
struct fib_lookup_arg {
	void			*lookup_ptr;
	const void		*lookup_data;
	void			*result;
	struct fib_rule		*rule;
	u32			table;	/* returned by fib_rule_get_table() for l3mdev rules */
	int			flags;
	/* NOTE(review): presumably bit values for @flags above -- confirm. */
#define FIB_LOOKUP_NOREF		1
#define FIB_LOOKUP_IGNORE_LINKSTATE	2
};

/*
 * Per-family rule operations: bookkeeping counters plus the callbacks a
 * family supplies (action/suppress/match/configure/delete/compare/fill/...).
 */
struct fib_rules_ops {
	int			family;
	struct list_head	list;
	int			rule_size;
	int			addr_size;
	int			unresolved_rules;
	int			nr_goto_rules;
	unsigned int		fib_rules_seq;

	int			(*action)(struct fib_rule *,
					  struct flowi *, int,
					  struct fib_lookup_arg *);
	bool			(*suppress)(struct fib_rule *, int,
					    struct fib_lookup_arg *);
	int			(*match)(struct fib_rule *,
					 struct flowi *, int);
	int			(*configure)(struct fib_rule *,
					     struct sk_buff *,
					     struct fib_rule_hdr *,
					     struct nlattr **,
					     struct netlink_ext_ack *);
	int			(*delete)(struct fib_rule *);
	int			(*compare)(struct fib_rule *,
					   struct fib_rule_hdr *,
					   struct nlattr **);
	int			(*fill)(struct fib_rule *, struct sk_buff *,
					struct fib_rule_hdr *);
	size_t			(*nlmsg_payload)(struct fib_rule *);

	/* Called after modifications to the rules set, must flush
	 * the route cache if one exists. */
	void			(*flush_cache)(struct fib_rules_ops *ops);

	int			nlgroup;
	struct list_head	rules_list;
	struct module		*owner;
	struct net		*fro_net;
	struct rcu_head		rcu;
};

/* Notifier payload carrying a rule; @info has to stay the first member. */
struct fib_rule_notifier_info {
	struct fib_notifier_info info; /* must be first */
	struct fib_rule *rule;
};

/* Take an additional reference on @rule. */
static inline void fib_rule_get(struct fib_rule *rule)
{
	refcount_inc(&rule->refcnt);
}

/*
 * Drop a reference on @rule. When the count reaches zero the rule is
 * released with kfree_rcu() through its embedded rcu head.
 */
static inline void fib_rule_put(struct fib_rule *rule)
{
	if (refcount_dec_and_test(&rule->refcnt))
		kfree_rcu(rule, rcu);
}

/*
 * fib_rule_get_table - table id to use for @rule in this lookup.
 *
 * With CONFIG_NET_L3_MASTER_DEV, a rule whose l3mdev field is non-zero
 * takes the table from @arg; every other rule (and every rule when the
 * option is off) uses its own table field.
 */
#ifdef CONFIG_NET_L3_MASTER_DEV
static inline u32 fib_rule_get_table(struct fib_rule *rule,
				     struct fib_lookup_arg *arg)
{
	return rule->l3mdev ? arg->table : rule->table;
}
#else
static inline u32 fib_rule_get_table(struct fib_rule *rule,
				     struct fib_lookup_arg *arg)
{
	return rule->table;
}
#endif

/*
 * Table id requested by a netlink message: the FRA_TABLE attribute when
 * present in @nla, otherwise the table field of the header @frh.
 */
static inline u32 frh_get_table(struct fib_rule_hdr *frh, struct nlattr **nla)
{
	if (nla[FRA_TABLE])
		return nla_get_u32(nla[FRA_TABLE]);
	return frh->table;
}

/* True when both ends of @range are non-zero. */
static inline bool fib_rule_port_range_set(const struct fib_rule_port_range *range)
{
	return range->start != 0 && range->end != 0;
}

/*
 * True when @port, converted with ntohs(), lies within [a->start, a->end]
 * (both bounds inclusive).
 */
static inline bool fib_rule_port_inrange(const struct fib_rule_port_range *a,
					 __be16 port)
{
	return ntohs(port) >= a->start &&
		ntohs(port) <= a->end;
}

/*
 * True when @a is an acceptable range: both ends non-zero, start <= end,
 * and end strictly below 0xffff (so a range ending at 0xffff is rejected).
 */
static inline bool fib_rule_port_range_valid(const struct fib_rule_port_range *a)
{
	return a->start != 0 && a->end != 0 && a->end < 0xffff &&
		a->start <= a->end;
}

/* True when @a and @b have identical start and end values. */
static inline bool fib_rule_port_range_compare(struct fib_rule_port_range *a,
					       struct fib_rule_port_range *b)
{
	return a->start == b->start &&
		a->end == b->end;
}

/*
 * True when @rule's iifindex is not LOOPBACK_IFINDEX and the rule sets
 * ip_proto or either port range.
 */
static inline bool fib_rule_requires_fldissect(struct fib_rule *rule)
{
	return rule->iifindex != LOOPBACK_IFINDEX && (rule->ip_proto ||
		fib_rule_port_range_set(&rule->sport_range) ||
		fib_rule_port_range_set(&rule->dport_range));
}

/* Entry points implemented outside this header. */
struct fib_rules_ops *fib_rules_register(const struct fib_rules_ops *,
					 struct net *);
void fib_rules_unregister(struct fib_rules_ops *);

int fib_rules_lookup(struct fib_rules_ops *, struct flowi *, int flags,
		     struct fib_lookup_arg *);
int fib_default_rule_add(struct fib_rules_ops *, u32 pref, u32 table);
bool fib_rule_matchall(const struct fib_rule *rule);
int fib_rules_dump(struct net *net, struct notifier_block *nb, int family,
		   struct netlink_ext_ack *extack);
unsigned int fib_rules_seq_read(struct net *net, int family);

int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
		   struct netlink_ext_ack *extack);
int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
		   struct netlink_ext_ack *extack);

/*
 * IPv4/IPv6 implementations of the match, action and suppress callbacks,
 * declared through INDIRECT_CALLABLE_DECLARE().
 */
INDIRECT_CALLABLE_DECLARE(int fib6_rule_match(struct fib_rule *rule,
					    struct flowi *fl, int flags));
INDIRECT_CALLABLE_DECLARE(int fib4_rule_match(struct fib_rule *rule,
					    struct flowi *fl, int flags));

INDIRECT_CALLABLE_DECLARE(int fib6_rule_action(struct fib_rule *rule,
			    struct flowi *flp, int flags,
			    struct fib_lookup_arg *arg));
INDIRECT_CALLABLE_DECLARE(int fib4_rule_action(struct fib_rule *rule,
			    struct flowi *flp, int flags,
			    struct fib_lookup_arg *arg));

INDIRECT_CALLABLE_DECLARE(bool fib6_rule_suppress(struct fib_rule *rule,
						int flags,
						struct fib_lookup_arg *arg));
INDIRECT_CALLABLE_DECLARE(bool fib4_rule_suppress(struct fib_rule *rule,
						int flags,
						struct fib_lookup_arg *arg));
#endif
494 498 585 585 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_BACKING_DEV_DEFS_H
#define __LINUX_BACKING_DEV_DEFS_H

#include <linux/list.h>
#include <linux/radix-tree.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/percpu_counter.h>
#include <linux/percpu-refcount.h>
#include <linux/flex_proportions.h>
#include <linux/timer.h>
#include <linux/workqueue.h>
#include <linux/kref.h>
#include <linux/refcount.h>

struct page;
struct device;
struct dentry;

/*
 * Bits in bdi_writeback.state
 */
enum wb_state {
	WB_registered,		/* bdi_register() was done */
	WB_writeback_running,	/* Writeback is in progress */
	WB_has_dirty_io,	/* Dirty inodes on ->b_{dirty|io|more_io} */
	WB_start_all,		/* nr_pages == 0 (all) work pending */
};

/*
 * Indices into bdi_writeback.stat[]; NR_WB_STAT_ITEMS is the array size.
 */
enum wb_stat_item {
	WB_RECLAIMABLE,
	WB_WRITEBACK,
	WB_DIRTIED,
	WB_WRITTEN,
	NR_WB_STAT_ITEMS
};

/* Batch size derived from the CPU count: 8 * (1 + ilog2(nr_cpu_ids)). */
#define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))

/*
 * why some writeback work was initiated
 */
enum wb_reason {
	WB_REASON_BACKGROUND,
	WB_REASON_VMSCAN,
	WB_REASON_SYNC,
	WB_REASON_PERIODIC,
	WB_REASON_LAPTOP_TIMER,
	WB_REASON_FS_FREE_SPACE,
	/*
	 * There is no bdi forker thread any more and works are done
	 * by emergency worker, however, this is TPs userland visible
	 * and we'll be exposing exactly the same information,
	 * so it has a mismatch name.
	 */
	WB_REASON_FORKER_THREAD,
	WB_REASON_FOREIGN_FLUSH,

	WB_REASON_MAX,
};

/* Completion counter paired with the wait queue its waiters sleep on. */
struct wb_completion {
	atomic_t		cnt;
	wait_queue_head_t	*waitq;
};

/* Compound-literal initializer: count starts at 1, waiters use @_waitq. */
#define __WB_COMPLETION_INIT(_waitq)	\
	(struct wb_completion){ .cnt = ATOMIC_INIT(1), .waitq = (_waitq) }

/*
 * If one wants to wait for one or more wb_writeback_works, each work's
 * ->done should be set to a wb_completion defined using the following
 * macro.  Once all work items are issued with wb_queue_work(), the caller
 * can wait for the completion of all using wb_wait_for_completion().  Work
 * items which are waited upon aren't freed automatically on completion.
 */
#define WB_COMPLETION_INIT(bdi)		__WB_COMPLETION_INIT(&(bdi)->wb_waitq)

#define DEFINE_WB_COMPLETION(cmpl, bdi)	\
	struct wb_completion cmpl = WB_COMPLETION_INIT(bdi)

/*
 * Each wb (bdi_writeback) can perform writeback operations, is measured
 * and throttled, independently.  Without cgroup writeback, each bdi
 * (bdi_writeback) is served by its embedded bdi->wb.
 *
 * On the default hierarchy, blkcg implicitly enables memcg.  This allows
 * using memcg's page ownership for attributing writeback IOs, and every
 * memcg - blkcg combination can be served by its own wb by assigning a
 * dedicated wb to each memcg, which enables isolation across different
 * cgroups and propagation of IO back pressure down from the IO layer up to
 * the tasks which are generating the dirty pages to be written back.
 *
 * A cgroup wb is indexed on its bdi by the ID of the associated memcg,
 * refcounted with the number of inodes attached to it, and pins the memcg
 * and the corresponding blkcg.  As the corresponding blkcg for a memcg may
 * change as blkcg is disabled and enabled higher up in the hierarchy, a wb
 * is tested for blkcg after lookup and removed from index on mismatch so
 * that a new wb for the combination can be created.
 *
 * Each bdi_writeback that is not embedded into the backing_dev_info must hold
 * a reference to the parent backing_dev_info.  See cgwb_create() for details.
 */
struct bdi_writeback {
	struct backing_dev_info *bdi;	/* our parent bdi */

	unsigned long state;		/* Always use atomic bitops on this */
	unsigned long last_old_flush;	/* last old data flush */

	struct list_head b_dirty;	/* dirty inodes */
	struct list_head b_io;		/* parked for writeback */
	struct list_head b_more_io;	/* parked for more writeback */
	struct list_head b_dirty_time;	/* time stamps are dirty */
	spinlock_t list_lock;		/* protects the b_* lists */

	atomic_t writeback_inodes;	/* number of inodes under writeback */
	struct percpu_counter stat[NR_WB_STAT_ITEMS];

	unsigned long bw_time_stamp;	/* last time write bw is updated */
	unsigned long dirtied_stamp;
	unsigned long written_stamp;	/* pages written at bw_time_stamp */
	unsigned long write_bandwidth;	/* the estimated write bandwidth */
	unsigned long avg_write_bandwidth; /* further smoothed write bw, > 0 */

	/*
	 * The base dirty throttle rate, re-calculated every 200ms.
	 * All the bdi tasks' dirty rate will be curbed under it.
	 * @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit
	 * in small steps and is much more smooth/stable than the latter.
	 */
	unsigned long dirty_ratelimit;
	unsigned long balanced_dirty_ratelimit;

	struct fprop_local_percpu completions;
	int dirty_exceeded;
	enum wb_reason start_all_reason;

	spinlock_t work_lock;		/* protects work_list & dwork scheduling */
	struct list_head work_list;
	struct delayed_work dwork;	/* work item used for writeback */
	struct delayed_work bw_dwork;	/* work item used for bandwidth estimate */

	struct list_head bdi_node;	/* anchored at bdi->wb_list */

#ifdef CONFIG_CGROUP_WRITEBACK
	struct percpu_ref refcnt;	/* used only for !root wb's */
	struct fprop_local_percpu memcg_completions;
	struct cgroup_subsys_state *memcg_css; /* the associated memcg */
	struct cgroup_subsys_state *blkcg_css; /* and blkcg */
	struct list_head memcg_node;	/* anchored at memcg->cgwb_list */
	struct list_head blkcg_node;	/* anchored at blkcg->cgwb_list */
	struct list_head b_attached;	/* attached inodes, protected by list_lock */
	struct list_head offline_node;	/* anchored at offline_cgwbs */

	union {
		struct work_struct release_work;
		struct rcu_head rcu;
	};
#endif
};

/*
 * Per-device writeback descriptor. It embeds the root bdi_writeback (@wb)
 * and links every wb serving the device on @wb_list.
 */
struct backing_dev_info {
	u64 id;
	struct rb_node rb_node; /* keyed by ->id */
	struct list_head bdi_list;
	unsigned long ra_pages;	/* max readahead in PAGE_SIZE units */
	unsigned long io_pages;	/* max allowed IO size */

	struct kref refcnt;	/* Reference counter for the structure */
	unsigned int capabilities; /* Device capabilities */
	unsigned int min_ratio;
	unsigned int max_ratio, max_prop_frac;

	/*
	 * Sum of avg_write_bw of wbs with dirty inodes.  > 0 if there are
	 * any dirty wbs, which is depended upon by bdi_has_dirty().
	 */
	atomic_long_t tot_write_bandwidth;
	/*
	 * Jiffies when last process was dirty throttled on this bdi. Used by
	 * blk-wbt.
	 */
	unsigned long last_bdp_sleep;

	struct bdi_writeback wb;  /* the root writeback info for this bdi */
	struct list_head wb_list; /* list of all wbs */
#ifdef CONFIG_CGROUP_WRITEBACK
	struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
	struct mutex cgwb_release_mutex;  /* protect shutdown of wb structs */
	struct rw_semaphore wb_switch_rwsem; /* no cgwb switch while syncing */
#endif
	wait_queue_head_t wb_waitq;	/* wait queue used by WB_COMPLETION_INIT() */

	struct device *dev;
	char dev_name[64];
	struct device *owner;

	struct timer_list laptop_mode_wb_timer;

#ifdef CONFIG_DEBUG_FS
	struct dentry *debug_dir;
#endif
};

/* A locked flag together with saved flags; the users are not in this header. */
struct wb_lock_cookie {
	bool locked;
	unsigned long flags;
};

#ifdef CONFIG_CGROUP_WRITEBACK

/**
 * wb_tryget - try to increment a wb's refcount
 * @wb: bdi_writeback to get
 *
 * The root wb embedded in its bdi is not refcounted, so this always
 * succeeds for it; for any other wb the result of percpu_ref_tryget()
 * on @wb->refcnt is returned.
 */
static inline bool wb_tryget(struct bdi_writeback *wb)
{
	if (wb != &wb->bdi->wb)
		return percpu_ref_tryget(&wb->refcnt);
	return true;
}

/**
 * wb_get - increment a wb's refcount
 * @wb: bdi_writeback to get
 *
 * No-op for the root wb embedded in its bdi.
 */
static inline void wb_get(struct bdi_writeback *wb)
{
	if (wb != &wb->bdi->wb)
		percpu_ref_get(&wb->refcnt);
}

/**
 * wb_put_many - decrement a wb's refcount by @nr
 * @wb: bdi_writeback to put
 * @nr: number of references to put
 *
 * Warns once and returns without touching the refcount when @wb has no
 * bdi. No-op for the root wb embedded in its bdi.
 */
static inline void wb_put_many(struct bdi_writeback *wb, unsigned long nr)
{
	if (WARN_ON_ONCE(!wb->bdi)) {
		/*
		 * A driver bug might cause a file to be removed before bdi was
		 * initialized.
		 */
		return;
	}

	if (wb != &wb->bdi->wb)
		percpu_ref_put_many(&wb->refcnt, nr);
}

/**
 * wb_put - decrement a wb's refcount
 * @wb: bdi_writeback to put
 *
 * Equivalent to wb_put_many(@wb, 1).
 */
static inline void wb_put(struct bdi_writeback *wb)
{
	wb_put_many(wb, 1);
}

/**
 * wb_dying - is a wb dying?
 * @wb: bdi_writeback of interest
 *
 * Returns whether @wb is unlinked and being drained.
 */
static inline bool wb_dying(struct bdi_writeback *wb)
{
	return percpu_ref_is_dying(&wb->refcnt);
}

#else	/* CONFIG_CGROUP_WRITEBACK */

/*
 * Stubs for builds without cgroup writeback: getting and putting are
 * no-ops, wb_tryget() always succeeds and wb_dying() is always false.
 */
static inline bool wb_tryget(struct bdi_writeback *wb)
{
	return true;
}

static inline void wb_get(struct bdi_writeback *wb)
{
}

static inline void wb_put(struct bdi_writeback *wb)
{
}

static inline void wb_put_many(struct bdi_writeback *wb, unsigned long nr)
{
}

static inline bool wb_dying(struct bdi_writeback *wb)
{
	return false;
}

#endif	/* CONFIG_CGROUP_WRITEBACK */

#endif	/* __LINUX_BACKING_DEV_DEFS_H */
4 4 6 6 6 5 4 5 6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 
522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) ST-Ericsson AB 2010 * Author: Sjur Brendeland */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ #include <linux/kernel.h> #include <linux/stddef.h> #include <linux/slab.h> #include <linux/netdevice.h> #include <linux/module.h> #include <net/caif/caif_layer.h> #include <net/caif/cfpkt.h> #include <net/caif/cfcnfg.h> #include <net/caif/cfctrl.h> #include <net/caif/cfmuxl.h> #include <net/caif/cffrml.h> #include <net/caif/cfserl.h> #include <net/caif/cfsrvl.h> #include <net/caif/caif_dev.h> #define container_obj(layr) container_of(layr, struct cfcnfg, layer) /* Information about CAIF physical interfaces held by Config Module in order * to manage physical interfaces */ struct cfcnfg_phyinfo { struct list_head node; bool up; /* Pointer to the layer below the MUX (framing layer) */ struct cflayer *frm_layer; /* Pointer to the lowest actual physical layer */ struct cflayer *phy_layer; /* Unique identifier of the physical interface */ unsigned int id; /* Preference of the physical in interface */ enum cfcnfg_phy_preference pref; /* Information about the physical device */ struct dev_info dev_info; /* Interface index */ int ifindex; /* Protocol head room added for CAIF link layer */ int head_room; /* Use Start of frame checksum */ bool use_fcs; }; struct cfcnfg { struct cflayer layer; struct cflayer *ctrl; struct cflayer *mux; struct list_head phys; struct mutex lock; }; static void cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id, enum cfctrl_srv serv, u8 phyid, struct cflayer *adapt_layer); static void cfcnfg_linkdestroy_rsp(struct 
cflayer *layer, u8 channel_id); static void cfcnfg_reject_rsp(struct cflayer *layer, u8 channel_id, struct cflayer *adapt_layer); static void cfctrl_resp_func(void); static void cfctrl_enum_resp(void); struct cfcnfg *cfcnfg_create(void) { struct cfcnfg *this; struct cfctrl_rsp *resp; might_sleep(); /* Initiate this layer */ this = kzalloc(sizeof(struct cfcnfg), GFP_ATOMIC); if (!this) return NULL; this->mux = cfmuxl_create(); if (!this->mux) goto out_of_mem; this->ctrl = cfctrl_create(); if (!this->ctrl) goto out_of_mem; /* Initiate response functions */ resp = cfctrl_get_respfuncs(this->ctrl); resp->enum_rsp = cfctrl_enum_resp; resp->linkerror_ind = cfctrl_resp_func; resp->linkdestroy_rsp = cfcnfg_linkdestroy_rsp; resp->sleep_rsp = cfctrl_resp_func; resp->wake_rsp = cfctrl_resp_func; resp->restart_rsp = cfctrl_resp_func; resp->radioset_rsp = cfctrl_resp_func; resp->linksetup_rsp = cfcnfg_linkup_rsp; resp->reject_rsp = cfcnfg_reject_rsp; INIT_LIST_HEAD(&this->phys); cfmuxl_set_uplayer(this->mux, this->ctrl, 0); layer_set_dn(this->ctrl, this->mux); layer_set_up(this->ctrl, this); mutex_init(&this->lock); return this; out_of_mem: synchronize_rcu(); kfree(this->mux); kfree(this->ctrl); kfree(this); return NULL; } void cfcnfg_remove(struct cfcnfg *cfg) { might_sleep(); if (cfg) { synchronize_rcu(); kfree(cfg->mux); cfctrl_remove(cfg->ctrl); kfree(cfg); } } static void cfctrl_resp_func(void) { } static struct cfcnfg_phyinfo *cfcnfg_get_phyinfo_rcu(struct cfcnfg *cnfg, u8 phyid) { struct cfcnfg_phyinfo *phy; list_for_each_entry_rcu(phy, &cnfg->phys, node) if (phy->id == phyid) return phy; return NULL; } static void cfctrl_enum_resp(void) { } static struct dev_info *cfcnfg_get_phyid(struct cfcnfg *cnfg, enum cfcnfg_phy_preference phy_pref) { /* Try to match with specified preference */ struct cfcnfg_phyinfo *phy; list_for_each_entry_rcu(phy, &cnfg->phys, node) { if (phy->up && phy->pref == phy_pref && phy->frm_layer != NULL) return &phy->dev_info; } /* Otherwise just 
return something */ list_for_each_entry_rcu(phy, &cnfg->phys, node) if (phy->up) return &phy->dev_info; return NULL; } static int cfcnfg_get_id_from_ifi(struct cfcnfg *cnfg, int ifi) { struct cfcnfg_phyinfo *phy; list_for_each_entry_rcu(phy, &cnfg->phys, node) if (phy->ifindex == ifi && phy->up) return phy->id; return -ENODEV; } int caif_disconnect_client(struct net *net, struct cflayer *adap_layer) { u8 channel_id; struct cfcnfg *cfg = get_cfcnfg(net); caif_assert(adap_layer != NULL); cfctrl_cancel_req(cfg->ctrl, adap_layer); channel_id = adap_layer->id; if (channel_id != 0) { struct cflayer *servl; servl = cfmuxl_remove_uplayer(cfg->mux, channel_id); cfctrl_linkdown_req(cfg->ctrl, channel_id, adap_layer); if (servl != NULL) layer_set_up(servl, NULL); } else pr_debug("nothing to disconnect\n"); /* Do RCU sync before initiating cleanup */ synchronize_rcu(); if (adap_layer->ctrlcmd != NULL) adap_layer->ctrlcmd(adap_layer, CAIF_CTRLCMD_DEINIT_RSP, 0); return 0; } EXPORT_SYMBOL(caif_disconnect_client); static void cfcnfg_linkdestroy_rsp(struct cflayer *layer, u8 channel_id) { } static const int protohead[CFCTRL_SRV_MASK] = { [CFCTRL_SRV_VEI] = 4, [CFCTRL_SRV_DATAGRAM] = 7, [CFCTRL_SRV_UTIL] = 4, [CFCTRL_SRV_RFM] = 3, [CFCTRL_SRV_DBG] = 3, }; static int caif_connect_req_to_link_param(struct cfcnfg *cnfg, struct caif_connect_request *s, struct cfctrl_link_param *l) { struct dev_info *dev_info; enum cfcnfg_phy_preference pref; int res; memset(l, 0, sizeof(*l)); /* In caif protocol low value is high priority */ l->priority = CAIF_PRIO_MAX - s->priority + 1; if (s->ifindex != 0) { res = cfcnfg_get_id_from_ifi(cnfg, s->ifindex); if (res < 0) return res; l->phyid = res; } else { switch (s->link_selector) { case CAIF_LINK_HIGH_BANDW: pref = CFPHYPREF_HIGH_BW; break; case CAIF_LINK_LOW_LATENCY: pref = CFPHYPREF_LOW_LAT; break; default: return -EINVAL; } dev_info = cfcnfg_get_phyid(cnfg, pref); if (dev_info == NULL) return -ENODEV; l->phyid = dev_info->id; } switch 
(s->protocol) { case CAIFPROTO_AT: l->linktype = CFCTRL_SRV_VEI; l->endpoint = (s->sockaddr.u.at.type >> 2) & 0x3; l->chtype = s->sockaddr.u.at.type & 0x3; break; case CAIFPROTO_DATAGRAM: l->linktype = CFCTRL_SRV_DATAGRAM; l->chtype = 0x00; l->u.datagram.connid = s->sockaddr.u.dgm.connection_id; break; case CAIFPROTO_DATAGRAM_LOOP: l->linktype = CFCTRL_SRV_DATAGRAM; l->chtype = 0x03; l->endpoint = 0x00; l->u.datagram.connid = s->sockaddr.u.dgm.connection_id; break; case CAIFPROTO_RFM: l->linktype = CFCTRL_SRV_RFM; l->u.datagram.connid = s->sockaddr.u.rfm.connection_id; strscpy(l->u.rfm.volume, s->sockaddr.u.rfm.volume, sizeof(l->u.rfm.volume)); break; case CAIFPROTO_UTIL: l->linktype = CFCTRL_SRV_UTIL; l->endpoint = 0x00; l->chtype = 0x00; strscpy(l->u.utility.name, s->sockaddr.u.util.service, sizeof(l->u.utility.name)); caif_assert(sizeof(l->u.utility.name) > 10); l->u.utility.paramlen = s->param.size; if (l->u.utility.paramlen > sizeof(l->u.utility.params)) l->u.utility.paramlen = sizeof(l->u.utility.params); memcpy(l->u.utility.params, s->param.data, l->u.utility.paramlen); break; case CAIFPROTO_DEBUG: l->linktype = CFCTRL_SRV_DBG; l->endpoint = s->sockaddr.u.dbg.service; l->chtype = s->sockaddr.u.dbg.type; break; default: return -EINVAL; } return 0; } int caif_connect_client(struct net *net, struct caif_connect_request *conn_req, struct cflayer *adap_layer, int *ifindex, int *proto_head, int *proto_tail) { struct cflayer *frml; struct cfcnfg_phyinfo *phy; int err; struct cfctrl_link_param param; struct cfcnfg *cfg = get_cfcnfg(net); rcu_read_lock(); err = caif_connect_req_to_link_param(cfg, conn_req, &param); if (err) goto unlock; phy = cfcnfg_get_phyinfo_rcu(cfg, param.phyid); if (!phy) { err = -ENODEV; goto unlock; } err = -EINVAL; if (adap_layer == NULL) { pr_err("adap_layer is zero\n"); goto unlock; } if (adap_layer->receive == NULL) { pr_err("adap_layer->receive is NULL\n"); goto unlock; } if (adap_layer->ctrlcmd == NULL) { pr_err("adap_layer->ctrlcmd == 
NULL\n"); goto unlock; } err = -ENODEV; frml = phy->frm_layer; if (frml == NULL) { pr_err("Specified PHY type does not exist!\n"); goto unlock; } caif_assert(param.phyid == phy->id); caif_assert(phy->frm_layer->id == param.phyid); caif_assert(phy->phy_layer->id == param.phyid); *ifindex = phy->ifindex; *proto_tail = 2; *proto_head = protohead[param.linktype] + phy->head_room; rcu_read_unlock(); /* FIXME: ENUMERATE INITIALLY WHEN ACTIVATING PHYSICAL INTERFACE */ cfctrl_enum_req(cfg->ctrl, param.phyid); return cfctrl_linkup_request(cfg->ctrl, &param, adap_layer); unlock: rcu_read_unlock(); return err; } EXPORT_SYMBOL(caif_connect_client); static void cfcnfg_reject_rsp(struct cflayer *layer, u8 channel_id, struct cflayer *adapt_layer) { if (adapt_layer != NULL && adapt_layer->ctrlcmd != NULL) adapt_layer->ctrlcmd(adapt_layer, CAIF_CTRLCMD_INIT_FAIL_RSP, 0); } static void cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id, enum cfctrl_srv serv, u8 phyid, struct cflayer *adapt_layer) { struct cfcnfg *cnfg = container_obj(layer); struct cflayer *servicel = NULL; struct cfcnfg_phyinfo *phyinfo; struct net_device *netdev; if (channel_id == 0) { pr_warn("received channel_id zero\n"); if (adapt_layer != NULL && adapt_layer->ctrlcmd != NULL) adapt_layer->ctrlcmd(adapt_layer, CAIF_CTRLCMD_INIT_FAIL_RSP, 0); return; } rcu_read_lock(); if (adapt_layer == NULL) { pr_debug("link setup response but no client exist, send linkdown back\n"); cfctrl_linkdown_req(cnfg->ctrl, channel_id, NULL); goto unlock; } caif_assert(cnfg != NULL); caif_assert(phyid != 0); phyinfo = cfcnfg_get_phyinfo_rcu(cnfg, phyid); if (phyinfo == NULL) { pr_err("ERROR: Link Layer Device disappeared while connecting\n"); goto unlock; } caif_assert(phyinfo != NULL); caif_assert(phyinfo->id == phyid); caif_assert(phyinfo->phy_layer != NULL); caif_assert(phyinfo->phy_layer->id == phyid); adapt_layer->id = channel_id; switch (serv) { case CFCTRL_SRV_VEI: servicel = cfvei_create(channel_id, &phyinfo->dev_info); 
break; case CFCTRL_SRV_DATAGRAM: servicel = cfdgml_create(channel_id, &phyinfo->dev_info); break; case CFCTRL_SRV_RFM: netdev = phyinfo->dev_info.dev; servicel = cfrfml_create(channel_id, &phyinfo->dev_info, netdev->mtu); break; case CFCTRL_SRV_UTIL: servicel = cfutill_create(channel_id, &phyinfo->dev_info); break; case CFCTRL_SRV_VIDEO: servicel = cfvidl_create(channel_id, &phyinfo->dev_info); break; case CFCTRL_SRV_DBG: servicel = cfdbgl_create(channel_id, &phyinfo->dev_info); break; default: pr_err("Protocol error. Link setup response - unknown channel type\n"); goto unlock; } if (!servicel) goto unlock; layer_set_dn(servicel, cnfg->mux); cfmuxl_set_uplayer(cnfg->mux, servicel, channel_id); layer_set_up(servicel, adapt_layer); layer_set_dn(adapt_layer, servicel); rcu_read_unlock(); servicel->ctrlcmd(servicel, CAIF_CTRLCMD_INIT_RSP, 0); return; unlock: rcu_read_unlock(); } int cfcnfg_add_phy_layer(struct cfcnfg *cnfg, struct net_device *dev, struct cflayer *phy_layer, enum cfcnfg_phy_preference pref, struct cflayer *link_support, bool fcs, int head_room) { struct cflayer *frml; struct cfcnfg_phyinfo *phyinfo = NULL; int i, res = 0; u8 phyid; mutex_lock(&cnfg->lock); /* CAIF protocol allow maximum 6 link-layers */ for (i = 0; i < 7; i++) { phyid = (dev->ifindex + i) & 0x7; if (phyid == 0) continue; if (cfcnfg_get_phyinfo_rcu(cnfg, phyid) == NULL) goto got_phyid; } pr_warn("Too many CAIF Link Layers (max 6)\n"); res = -EEXIST; goto out; got_phyid: phyinfo = kzalloc(sizeof(struct cfcnfg_phyinfo), GFP_ATOMIC); if (!phyinfo) { res = -ENOMEM; goto out; } phy_layer->id = phyid; phyinfo->pref = pref; phyinfo->id = phyid; phyinfo->dev_info.id = phyid; phyinfo->dev_info.dev = dev; phyinfo->phy_layer = phy_layer; phyinfo->ifindex = dev->ifindex; phyinfo->head_room = head_room; phyinfo->use_fcs = fcs; frml = cffrml_create(phyid, fcs); if (!frml) { res = -ENOMEM; goto out_err; } phyinfo->frm_layer = frml; layer_set_up(frml, cnfg->mux); if (link_support != NULL) { 
link_support->id = phyid; layer_set_dn(frml, link_support); layer_set_up(link_support, frml); layer_set_dn(link_support, phy_layer); layer_set_up(phy_layer, link_support); } else { layer_set_dn(frml, phy_layer); layer_set_up(phy_layer, frml); } list_add_rcu(&phyinfo->node, &cnfg->phys); out: mutex_unlock(&cnfg->lock); return res; out_err: kfree(phyinfo); mutex_unlock(&cnfg->lock); return res; } EXPORT_SYMBOL(cfcnfg_add_phy_layer); int cfcnfg_set_phy_state(struct cfcnfg *cnfg, struct cflayer *phy_layer, bool up) { struct cfcnfg_phyinfo *phyinfo; rcu_read_lock(); phyinfo = cfcnfg_get_phyinfo_rcu(cnfg, phy_layer->id); if (phyinfo == NULL) { rcu_read_unlock(); return -ENODEV; } if (phyinfo->up == up) { rcu_read_unlock(); return 0; } phyinfo->up = up; if (up) { cffrml_hold(phyinfo->frm_layer); cfmuxl_set_dnlayer(cnfg->mux, phyinfo->frm_layer, phy_layer->id); } else { cfmuxl_remove_dnlayer(cnfg->mux, phy_layer->id); cffrml_put(phyinfo->frm_layer); } rcu_read_unlock(); return 0; } EXPORT_SYMBOL(cfcnfg_set_phy_state); int cfcnfg_del_phy_layer(struct cfcnfg *cnfg, struct cflayer *phy_layer) { struct cflayer *frml, *frml_dn; u16 phyid; struct cfcnfg_phyinfo *phyinfo; might_sleep(); mutex_lock(&cnfg->lock); phyid = phy_layer->id; phyinfo = cfcnfg_get_phyinfo_rcu(cnfg, phyid); if (phyinfo == NULL) { mutex_unlock(&cnfg->lock); return 0; } caif_assert(phyid == phyinfo->id); caif_assert(phy_layer == phyinfo->phy_layer); caif_assert(phy_layer->id == phyid); caif_assert(phyinfo->frm_layer->id == phyid); list_del_rcu(&phyinfo->node); synchronize_rcu(); /* Fail if reference count is not zero */ if (cffrml_refcnt_read(phyinfo->frm_layer) != 0) { pr_info("Wait for device inuse\n"); list_add_rcu(&phyinfo->node, &cnfg->phys); mutex_unlock(&cnfg->lock); return -EAGAIN; } frml = phyinfo->frm_layer; frml_dn = frml->dn; cffrml_set_uplayer(frml, NULL); cffrml_set_dnlayer(frml, NULL); if (phy_layer != frml_dn) { layer_set_up(frml_dn, NULL); layer_set_dn(frml_dn, NULL); } 
layer_set_up(phy_layer, NULL); if (phyinfo->phy_layer != frml_dn) kfree(frml_dn); cffrml_free(frml); kfree(phyinfo); mutex_unlock(&cnfg->lock); return 0; } EXPORT_SYMBOL(cfcnfg_del_phy_layer);
862 1200 1379 15 81 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 /* SPDX-License-Identifier: GPL-2.0 */ /* * This header is used to share core functionality between the * standalone connection tracking module, and the compatibility layer's use * of connection tracking. * * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> * - generalize L3 protocol dependent part. * * Derived from include/linux/netfiter_ipv4/ip_conntrack_core.h */ #ifndef _NF_CONNTRACK_CORE_H #define _NF_CONNTRACK_CORE_H #include <linux/netfilter.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_l4proto.h> /* This header is used to share core functionality between the standalone connection tracking module, and the compatibility layer's use of connection tracking. */ unsigned int nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state); int nf_conntrack_init_net(struct net *net); void nf_conntrack_cleanup_net(struct net *net); void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list); void nf_conntrack_proto_pernet_init(struct net *net); int nf_conntrack_proto_init(void); void nf_conntrack_proto_fini(void); int nf_conntrack_init_start(void); void nf_conntrack_cleanup_start(void); void nf_conntrack_init_end(void); void nf_conntrack_cleanup_end(void); bool nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, const struct nf_conntrack_tuple *orig); /* Find a connection corresponding to a tuple. 
*/ struct nf_conntrack_tuple_hash * nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple); int __nf_conntrack_confirm(struct sk_buff *skb); /* Confirm a connection: returns NF_DROP if packet must be dropped. */ static inline int nf_conntrack_confirm(struct sk_buff *skb) { struct nf_conn *ct = (struct nf_conn *)skb_nfct(skb); int ret = NF_ACCEPT; if (ct) { if (!nf_ct_is_confirmed(ct)) { ret = __nf_conntrack_confirm(skb); if (ret == NF_ACCEPT) ct = (struct nf_conn *)skb_nfct(skb); } if (ret == NF_ACCEPT && nf_ct_ecache_exist(ct)) nf_ct_deliver_cached_events(ct); } return ret; } unsigned int nf_confirm(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); void print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_l4proto *proto); #define CONNTRACK_LOCKS 1024 extern spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS]; void nf_conntrack_lock(spinlock_t *lock); extern spinlock_t nf_conntrack_expect_lock; /* ctnetlink code shared by both ctnetlink and nf_conntrack_bpf */ static inline void __nf_ct_set_timeout(struct nf_conn *ct, u64 timeout) { if (timeout > INT_MAX) timeout = INT_MAX; if (nf_ct_is_confirmed(ct)) WRITE_ONCE(ct->timeout, nfct_time_stamp + (u32)timeout); else ct->timeout = (u32)timeout; } int __nf_ct_change_timeout(struct nf_conn *ct, u64 cta_timeout); void __nf_ct_change_status(struct nf_conn *ct, unsigned long on, unsigned long off); int nf_ct_change_status_common(struct nf_conn *ct, unsigned int status); #endif /* _NF_CONNTRACK_CORE_H */
2 1 2 9 9 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ #include "main.h" #include <linux/errno.h> #include <linux/list.h> #include <linux/moduleparam.h> #include <linux/netlink.h> #include <linux/printk.h> #include <linux/skbuff.h> #include <linux/stddef.h> #include <linux/string.h> #include <net/genetlink.h> #include <net/netlink.h> #include <uapi/linux/batman_adv.h> #include "bat_algo.h" #include "netlink.h" char batadv_routing_algo[20] = "BATMAN_IV"; static struct hlist_head batadv_algo_list; /** * batadv_algo_init() - Initialize batman-adv algorithm management data * structures */ void batadv_algo_init(void) { INIT_HLIST_HEAD(&batadv_algo_list); } /** * batadv_algo_get() - Search for algorithm with specific name * @name: algorithm name to find * * Return: Pointer to batadv_algo_ops on success, NULL otherwise */ struct batadv_algo_ops *batadv_algo_get(const char *name) { struct batadv_algo_ops *bat_algo_ops = NULL, *bat_algo_ops_tmp; hlist_for_each_entry(bat_algo_ops_tmp, &batadv_algo_list, list) { if (strcmp(bat_algo_ops_tmp->name, name) != 0) continue; bat_algo_ops = bat_algo_ops_tmp; break; } return bat_algo_ops; } /** * batadv_algo_register() - Register 
callbacks for a mesh algorithm * @bat_algo_ops: mesh algorithm callbacks to add * * Return: 0 on success or negative error number in case of failure */ int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops) { struct batadv_algo_ops *bat_algo_ops_tmp; bat_algo_ops_tmp = batadv_algo_get(bat_algo_ops->name); if (bat_algo_ops_tmp) { pr_info("Trying to register already registered routing algorithm: %s\n", bat_algo_ops->name); return -EEXIST; } /* all algorithms must implement all ops (for now) */ if (!bat_algo_ops->iface.enable || !bat_algo_ops->iface.disable || !bat_algo_ops->iface.update_mac || !bat_algo_ops->iface.primary_set || !bat_algo_ops->neigh.cmp || !bat_algo_ops->neigh.is_similar_or_better) { pr_info("Routing algo '%s' does not implement required ops\n", bat_algo_ops->name); return -EINVAL; } INIT_HLIST_NODE(&bat_algo_ops->list); hlist_add_head(&bat_algo_ops->list, &batadv_algo_list); return 0; } /** * batadv_algo_select() - Select algorithm of soft interface * @bat_priv: the bat priv with all the soft interface information * @name: name of the algorithm to select * * The algorithm callbacks for the soft interface will be set when the algorithm * with the correct name was found. Any previous selected algorithm will not be * deinitialized and the new selected algorithm will also not be initialized. * It is therefore not allowed to call batadv_algo_select outside the creation * function of the soft interface. 
* * Return: 0 on success or negative error number in case of failure */ int batadv_algo_select(struct batadv_priv *bat_priv, const char *name) { struct batadv_algo_ops *bat_algo_ops; bat_algo_ops = batadv_algo_get(name); if (!bat_algo_ops) return -EINVAL; bat_priv->algo_ops = bat_algo_ops; return 0; } static int batadv_param_set_ra(const char *val, const struct kernel_param *kp) { struct batadv_algo_ops *bat_algo_ops; char *algo_name = (char *)val; size_t name_len = strlen(algo_name); if (name_len > 0 && algo_name[name_len - 1] == '\n') algo_name[name_len - 1] = '\0'; bat_algo_ops = batadv_algo_get(algo_name); if (!bat_algo_ops) { pr_err("Routing algorithm '%s' is not supported\n", algo_name); return -EINVAL; } return param_set_copystring(algo_name, kp); } static const struct kernel_param_ops batadv_param_ops_ra = { .set = batadv_param_set_ra, .get = param_get_string, }; static struct kparam_string batadv_param_string_ra = { .maxlen = sizeof(batadv_routing_algo), .string = batadv_routing_algo, }; module_param_cb(routing_algo, &batadv_param_ops_ra, &batadv_param_string_ra, 0644); /** * batadv_algo_dump_entry() - fill in information about one supported routing * algorithm * @msg: netlink message to be sent back * @portid: Port to reply to * @seq: Sequence number of message * @bat_algo_ops: Algorithm to be dumped * * Return: Error number, or 0 on success */ static int batadv_algo_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_algo_ops *bat_algo_ops) { void *hdr; hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_ROUTING_ALGOS); if (!hdr) return -EMSGSIZE; if (nla_put_string(msg, BATADV_ATTR_ALGO_NAME, bat_algo_ops->name)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } /** * batadv_algo_dump() - fill in information about supported routing * algorithms * @msg: netlink message to be sent back * @cb: Parameters to the netlink request * * Return: 
Length of reply message. */ int batadv_algo_dump(struct sk_buff *msg, struct netlink_callback *cb) { int portid = NETLINK_CB(cb->skb).portid; struct batadv_algo_ops *bat_algo_ops; int skip = cb->args[0]; int i = 0; hlist_for_each_entry(bat_algo_ops, &batadv_algo_list, list) { if (i++ < skip) continue; if (batadv_algo_dump_entry(msg, portid, cb->nlh->nlmsg_seq, bat_algo_ops)) { i--; break; } } cb->args[0] = i; return msg->len; }
23 1294 171 1089 1233 1042 1051 835 1221 2077 1280 1240 1246 69 1319 1972 1047 13 1949 1039 947 1770 39 83 110 1388 5 1092 2 1077 1 1 76 72 1090 2182 1124 1125 196 38 159 1412 905 840 64 24 1615 22 23 91 89 75 74 73 47 26 437 433 11 403 26 425 91 92 93 92 94 93 92 92 90 91 90 857 3 903 868 76 77 75 74 2022 1747 72 1745 1998 2086 892 867 228 228 230 229 3 228 219 4 235 1238 1224 45 46 44 45 44 45 46 45 45 46 46 1427 1177 1188 1181 1211 22 1170 1186 1190 1183 1185 1228 1190 1180 1183 1288 1278 1289 1329 1273 1 1276 1591 1599 1595 93 478 475 426 74 69 74 75 1518 1592 1501 1500 1518 1506 1516 1567 1508 1501 1512 36 1797 1797 1791 854 1529 1531 16 1512 441 144 1551 550 163 12 378 291 332 504 391 50 17 90 234 108 94 332 2182 2171 66 2169 258 301 1938 1139 1813 527 10 507 1466 198 2181 1424 1262 713 254 1393 1430 6 6 14 12 11 2 7 2 1 2 1 2 2 70 69 10 10 10 10 9 10 9 2 10 10 2 9 1 9 2 2 3 3 3 4 1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 
302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 
802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 
1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 
1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 
2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 
2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 
2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 // SPDX-License-Identifier: GPL-2.0-only /* Connection state tracking for netfilter. This is separated from, but required by, the NAT layer; it can also be used by an iptables extension. */ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> * (C) 2005-2012 Patrick McHardy <kaber@trash.net> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/netfilter.h> #include <linux/module.h> #include <linux/sched.h> #include <linux/skbuff.h> #include <linux/proc_fs.h> #include <linux/vmalloc.h> #include <linux/stddef.h> #include <linux/slab.h> #include <linux/random.h> #include <linux/siphash.h> #include <linux/err.h> #include <linux/percpu.h> #include <linux/moduleparam.h> #include <linux/notifier.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/socket.h> #include <linux/mm.h> #include <linux/nsproxy.h> #include <linux/rculist_nulls.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_bpf.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_conntrack_acct.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_conntrack_timestamp.h> #include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_conntrack_labels.h> #include <net/netfilter/nf_conntrack_synproxy.h> #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_helper.h> #include <net/netns/hash.h> #include <net/ip.h> #include "nf_internals.h" __cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS]; EXPORT_SYMBOL_GPL(nf_conntrack_locks); 
/* Exported spinlock.
 * NOTE(review): by its name this protects the expectation code; its users
 * are outside this part of the file -- confirm there.
 */
__cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);

/* Bucket array of the conntrack hash table.  It is indexed with a scaled
 * hash value, e.g. in __nf_conntrack_hash_insert().
 */
struct hlist_nulls_head *nf_conntrack_hash __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_hash);

/* Bookkeeping carried by the conntrack garbage-collection delayed work.
 * NOTE(review): the fields are consumed by the GC worker, which is not in
 * this part of the file -- confirm their exact meaning there rather than
 * relying on the names alone.
 */
struct conntrack_gc_work {
	struct delayed_work	dwork;
	u32			next_bucket;
	u32			avg_timeout;
	u32			count;
	u32			start_time;
	bool			exiting;
	bool			early_drop;
};

/* NOTE(review): presumably the slab cache nf_conn objects are allocated
 * from -- the allocation site is not visible here.
 */
static __read_mostly struct kmem_cache *nf_conntrack_cachep;
/* Held from nf_conntrack_all_lock() until nf_conntrack_all_unlock(); the
 * slow path of nf_conntrack_lock() also takes it briefly.
 */
static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
/* Set to true by nf_conntrack_all_lock() and cleared again by
 * nf_conntrack_all_unlock(); read by nf_conntrack_lock().
 */
static __read_mostly bool nf_conntrack_locks_all;

/* serialize hash resizes and nf_ct_iterate_cleanup */
static DEFINE_MUTEX(nf_conntrack_mutex);

/* Both bounds are expressed in jiffies (multiples of HZ). */
#define GC_SCAN_INTERVAL_MAX	(60ul * HZ)
#define GC_SCAN_INTERVAL_MIN	(1ul * HZ)

/* clamp timeouts to this value (TCP unacked) */
#define GC_SCAN_INTERVAL_CLAMP	(300ul * HZ)

/* Initial bias pretending we have 100 entries at the upper bound so we don't
 * wakeup often just because we have three entries with a 1s timeout while still
 * allowing non-idle machines to wakeup more often when needed.
*/
#define GC_SCAN_INITIAL_COUNT	100
#define GC_SCAN_INTERVAL_INIT	GC_SCAN_INTERVAL_MAX

#define GC_SCAN_MAX_DURATION	msecs_to_jiffies(10)
#define GC_SCAN_EXPIRED_MAX	(64000u / HZ)

/* Chain-length limit used by the insert paths in this file: they compute
 * MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN) and bail out with the
 * "chaintoolong" statistic once a bucket walk exceeds that value.
 */
#define MIN_CHAINLEN	50u
#define MAX_CHAINLEN	(80u - MIN_CHAINLEN)

static struct conntrack_gc_work conntrack_gc_work;

/* Acquire one bucket lock while cooperating with nf_conntrack_all_lock().
 *
 * Fast path: take @lock and return if nf_conntrack_locks_all is false.
 * Slow path: drop @lock again and re-take it while holding
 * nf_conntrack_locks_all_lock, so that this CPU waits for the holder of
 * the "all" lock to finish first.
 *
 * On return @lock is held.
 */
void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
{
	/* 1) Acquire the lock */
	spin_lock(lock);

	/* 2) read nf_conntrack_locks_all, with ACQUIRE semantics
	 * It pairs with the smp_store_release() in nf_conntrack_all_unlock()
	 */
	if (likely(smp_load_acquire(&nf_conntrack_locks_all) == false))
		return;

	/* fast path failed, unlock */
	spin_unlock(lock);

	/* Slow path 1) get global lock */
	spin_lock(&nf_conntrack_locks_all_lock);

	/* Slow path 2) get the lock we want */
	spin_lock(lock);

	/* Slow path 3) release the global lock */
	spin_unlock(&nf_conntrack_locks_all_lock);
}
EXPORT_SYMBOL_GPL(nf_conntrack_lock);

/* Release the bucket lock(s) covering hash slots @h1 and @h2.
 *
 * Both slots are first reduced modulo CONNTRACK_LOCKS; when they map to the
 * same lock only a single unlock is performed.
 */
static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2)
{
	h1 %= CONNTRACK_LOCKS;
	h2 %= CONNTRACK_LOCKS;
	spin_unlock(&nf_conntrack_locks[h1]);
	if (h1 != h2)
		spin_unlock(&nf_conntrack_locks[h2]);
}

/* Take the bucket lock(s) covering hash slots @h1 and @h2.
 *
 * The lock with the lower index is always taken first (via
 * nf_conntrack_lock()), the second one with spin_lock_nested(), so every
 * caller acquires the pair in the same order.  @net is not used here.
 *
 * return true if we need to recompute hashes (in case hash table was
 * resized); in that case the locks have already been dropped again.
 * Otherwise return false with the lock(s) held.
 */
static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
				     unsigned int h2, unsigned int sequence)
{
	h1 %= CONNTRACK_LOCKS;
	h2 %= CONNTRACK_LOCKS;
	if (h1 <= h2) {
		nf_conntrack_lock(&nf_conntrack_locks[h1]);
		if (h1 != h2)
			spin_lock_nested(&nf_conntrack_locks[h2],
					 SINGLE_DEPTH_NESTING);
	} else {
		nf_conntrack_lock(&nf_conntrack_locks[h2]);
		spin_lock_nested(&nf_conntrack_locks[h1],
				 SINGLE_DEPTH_NESTING);
	}
	/* @sequence was sampled by the caller before hashing; a retry means
	 * nf_conntrack_generation changed in the meantime.
	 */
	if (read_seqcount_retry(&nf_conntrack_generation, sequence)) {
		nf_conntrack_double_unlock(h1, h2);
		return true;
	}
	return false;
}

/* Block all users of nf_conntrack_lock().
 *
 * Takes nf_conntrack_locks_all_lock, sets nf_conntrack_locks_all and then
 * locks and immediately unlocks every bucket lock once.  On return only
 * nf_conntrack_locks_all_lock is still held; it is released by
 * nf_conntrack_all_unlock().
 */
static void nf_conntrack_all_lock(void)
	__acquires(&nf_conntrack_locks_all_lock)
{
	int i;

	spin_lock(&nf_conntrack_locks_all_lock);

	/* For nf_conntrack_locks_all, only the latest time when another
	 * CPU will see an update is controlled, by the "release" of the
	 * spin_lock below.
	 * The earliest time is not controlled, and thus KCSAN could detect
	 * a race when nf_conntrack_lock() reads the variable.
	 * WRITE_ONCE() is used to ensure the compiler will not
	 * optimize the write.
	 */
	WRITE_ONCE(nf_conntrack_locks_all, true);

	for (i = 0; i < CONNTRACK_LOCKS; i++) {
		spin_lock(&nf_conntrack_locks[i]);

		/* This spin_unlock provides the "release" to ensure that
		 * nf_conntrack_locks_all==true is visible to everyone that
		 * acquired spin_lock(&nf_conntrack_locks[]).
		 */
		spin_unlock(&nf_conntrack_locks[i]);
	}
}

/* Counterpart of nf_conntrack_all_lock(): clear nf_conntrack_locks_all and
 * drop nf_conntrack_locks_all_lock.
 */
static void nf_conntrack_all_unlock(void)
	__releases(&nf_conntrack_locks_all_lock)
{
	/* All prior stores must be complete before we clear
	 * 'nf_conntrack_locks_all'. Otherwise nf_conntrack_lock()
	 * might observe the false value but not the entire
	 * critical section.
	 * It pairs with the smp_load_acquire() in nf_conntrack_lock()
	 */
	smp_store_release(&nf_conntrack_locks_all, false);
	spin_unlock(&nf_conntrack_locks_all_lock);
}

/* Number of buckets that scale_hash() maps raw hashes onto. */
unsigned int nf_conntrack_htable_size __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);

/* NOTE(review): by its name the upper limit on tracked connections; the
 * code enforcing it is not in this part of the file.
 */
unsigned int nf_conntrack_max __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_max);
/* Sampled before hashing and re-checked in nf_conntrack_double_lock(). */
seqcount_spinlock_t nf_conntrack_generation __read_mostly;
/* Base siphash key for hash_conntrack_raw(); filled in on first use. */
static siphash_aligned_key_t nf_conntrack_hash_rnd;

/* Compute the unscaled 32-bit hash of @tuple.
 *
 * The siphash covers the tuple from its start up to and including
 * dst.__nfct_hash_offsetend.  The key is a copy of nf_conntrack_hash_rnd
 * with @zoneid xor-ed into word 0 and net_hash_mix(@net) into word 1.
 */
static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
			      unsigned int zoneid,
			      const struct net *net)
{
	siphash_key_t key;

	get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd));

	key = nf_conntrack_hash_rnd;

	key.key[0] ^= zoneid;
	key.key[1] ^= net_hash_mix(net);

	return siphash((void *)tuple,
			offsetofend(struct nf_conntrack_tuple, dst.__nfct_hash_offsetend),
			&key);
}

/* Scale a raw hash to a bucket index of the current table size. */
static u32 scale_hash(u32 hash)
{
	return reciprocal_scale(hash, nf_conntrack_htable_size);
}

/* Hash @tuple and scale the result to a table of @size buckets. */
static u32 __hash_conntrack(const struct net *net,
			    const struct nf_conntrack_tuple *tuple,
			    unsigned int zoneid,
			    unsigned int size)
{
	return reciprocal_scale(hash_conntrack_raw(tuple, zoneid, net), size);
}

/* Hash @tuple and scale the result to nf_conntrack_htable_size buckets. */
static u32 hash_conntrack(const struct net *net,
			  const struct nf_conntrack_tuple *tuple,
			  unsigned int zoneid)
{
	return scale_hash(hash_conntrack_raw(tuple, zoneid, net));
}

/* Read the source and destination port from the first four bytes at
 * @dataoff and store them in @tuple (through the udp members of the port
 * unions).
 *
 * Returns false if those bytes cannot be read from @skb, true otherwise.
 */
static bool nf_ct_get_tuple_ports(const struct sk_buff *skb,
				  unsigned int dataoff,
				  struct nf_conntrack_tuple *tuple)
{
	struct {
		__be16 sport;
		__be16 dport;
	} _inet_hdr, *inet_hdr;

	/* Actually only need first 4 bytes to get ports. */
	inet_hdr = skb_header_pointer(skb, dataoff, sizeof(_inet_hdr), &_inet_hdr);
	if (!inet_hdr)
		return false;

	tuple->src.u.udp.port = inet_hdr->sport;
	tuple->dst.u.udp.port = inet_hdr->dport;
	return true;
}

/* Build the ORIGINAL-direction tuple for a packet.
 *
 * @nhoff:    offset of the network header in @skb
 * @dataoff:  offset of the layer 4 header in @skb
 * @l3num:    NFPROTO_IPV4 or NFPROTO_IPV6; for any other value the function
 *            returns true with only src.l3num set in the zeroed tuple
 * @protonum: layer 4 protocol number
 *
 * Returns false when the addresses cannot be read from @skb or when the
 * protocol specific extraction fails, true otherwise.
 */
static bool
nf_ct_get_tuple(const struct sk_buff *skb,
		unsigned int nhoff,
		unsigned int dataoff,
		u_int16_t l3num,
		u_int8_t protonum,
		struct net *net,
		struct nf_conntrack_tuple *tuple)
{
	unsigned int size;
	const __be32 *ap;
	/* room for two IPv6 addresses (source followed by destination) */
	__be32 _addrs[8];

	memset(tuple, 0, sizeof(*tuple));

	tuple->src.l3num = l3num;
	switch (l3num) {
	case NFPROTO_IPV4:
		nhoff += offsetof(struct iphdr, saddr);
		size = 2 * sizeof(__be32);
		break;
	case NFPROTO_IPV6:
		nhoff += offsetof(struct ipv6hdr, saddr);
		size = sizeof(_addrs);
		break;
	default:
		return true;
	}

	/* nhoff now points at the source address inside the header */
	ap = skb_header_pointer(skb, nhoff, size, _addrs);
	if (!ap)
		return false;

	switch (l3num) {
	case NFPROTO_IPV4:
		tuple->src.u3.ip = ap[0];
		tuple->dst.u3.ip = ap[1];
		break;
	case NFPROTO_IPV6:
		memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6));
		memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6));
		break;
	}

	tuple->dst.protonum = protonum;
	tuple->dst.dir = IP_CT_DIR_ORIGINAL;

	switch (protonum) {
#if IS_ENABLED(CONFIG_IPV6)
	case IPPROTO_ICMPV6:
		return icmpv6_pkt_to_tuple(skb, dataoff, net, tuple);
#endif
	case IPPROTO_ICMP:
		return icmp_pkt_to_tuple(skb, dataoff, net, tuple);
#ifdef CONFIG_NF_CT_PROTO_GRE
	case IPPROTO_GRE:
		return gre_pkt_to_tuple(skb, dataoff, net, tuple);
#endif
	case IPPROTO_TCP:
	case IPPROTO_UDP:
#ifdef CONFIG_NF_CT_PROTO_UDPLITE
	case IPPROTO_UDPLITE:
#endif
#ifdef CONFIG_NF_CT_PROTO_SCTP
	case IPPROTO_SCTP:
#endif
#ifdef CONFIG_NF_CT_PROTO_DCCP
	case IPPROTO_DCCP:
#endif
		/* fallthrough */
		return nf_ct_get_tuple_ports(skb, dataoff, tuple);
	default:
		break;
	}

	/* other protocols: addresses and protocol number only */
	return true;
}

/* Locate the layer 4 header of an IPv4 packet.
 *
 * Returns the offset of the layer 4 header (nhoff + header length) or -1
 * if the IPv4 header cannot be read, the fragment-offset bits are set, or
 * the computed offset lies beyond skb->len.
 *
 * Note that *@protonum is written before the length check, so it is also
 * set when -1 is returned for a bogus header length.
 */
static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
			    u_int8_t *protonum)
{
	int dataoff = -1;
	const struct iphdr *iph;
	struct iphdr _iph;

	iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
	if (!iph)
		return -1;

	/* Conntrack defragments packets, we might still see fragments
	 * inside ICMP packets though.
	 */
	if (iph->frag_off & htons(IP_OFFSET))
		return -1;

	/* ihl counts 32-bit words */
	dataoff = nhoff + (iph->ihl << 2);
	*protonum = iph->protocol;

	/* Check bogus IP headers */
	if (dataoff > skb->len) {
		pr_debug("bogus IPv4 packet: nhoff %u, ihl %u, skblen %u\n",
			 nhoff, iph->ihl << 2, skb->len);
		return -1;
	}
	return dataoff;
}

#if IS_ENABLED(CONFIG_IPV6)
/* Locate the layer 4 header of an IPv6 packet, skipping extension headers.
 *
 * Returns the offset reported by ipv6_skip_exthdr() and stores the final
 * next-header value in *@protonum.  Returns -1 (leaving *@protonum
 * untouched) if the next-header byte cannot be read, if skipping fails, or
 * if any bit of frag_off outside the low three is set.
 */
static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
			    u8 *protonum)
{
	int protoff = -1;
	unsigned int extoff = nhoff + sizeof(struct ipv6hdr);
	__be16 frag_off;
	u8 nexthdr;

	if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr),
			  &nexthdr, sizeof(nexthdr)) != 0) {
		pr_debug("can't get nexthdr\n");
		return -1;
	}
	protoff = ipv6_skip_exthdr(skb, extoff, &nexthdr, &frag_off);
	/*
	 * (protoff == skb->len) means the packet has not data, just
	 * IPv6 and possibly extensions headers, but it is tracked anyway
	 */
	if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
		pr_debug("can't find proto in pkt\n");
		return -1;
	}

	*protonum = nexthdr;
	return protoff;
}
#endif

/* Dispatch to the per-family layer 4 lookup.
 *
 * Returns the layer 4 header offset, or -1 on failure.  For a family that
 * is not handled here *@l4num is set to 0 and -1 is returned.
 */
static int get_l4proto(const struct sk_buff *skb,
		       unsigned int nhoff, u8 pf, u8 *l4num)
{
	switch (pf) {
	case NFPROTO_IPV4:
		return ipv4_get_l4proto(skb, nhoff, l4num);
#if IS_ENABLED(CONFIG_IPV6)
	case NFPROTO_IPV6:
		return ipv6_get_l4proto(skb, nhoff, l4num);
#endif
	default:
		*l4num = 0;
		break;
	}
	return -1;
}

/* Build a tuple for the packet whose network header starts at @nhoff.
 *
 * Looks up the layer 4 protocol and offset first, then fills @tuple via
 * nf_ct_get_tuple().  Returns false if the layer 4 offset is <= 0 or the
 * tuple could not be extracted.
 */
bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
		       u_int16_t l3num,
		       struct net *net, struct nf_conntrack_tuple *tuple)
{
	u8 protonum;
	int protoff;

	protoff = get_l4proto(skb, nhoff, l3num, &protonum);
	if (protoff <= 0)
		return false;

	return nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple);
}
EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr);

/* Fill @inverse with the tuple of the opposite direction of @orig.
 *
 * Source and destination addresses are swapped (IPv4 and IPv6 only; for
 * other families the addresses stay zero), the direction is negated and the
 * protocol number copied.  ICMP and ICMPv6 are handed to their own invert
 * helpers, whose result is returned; for every other protocol the layer 4
 * parts (u.all) are swapped and true is returned.
 */
bool
nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
		   const struct nf_conntrack_tuple *orig)
{
	memset(inverse, 0, sizeof(*inverse));

	inverse->src.l3num = orig->src.l3num;

	switch (orig->src.l3num) {
	case NFPROTO_IPV4:
		inverse->src.u3.ip = orig->dst.u3.ip;
		inverse->dst.u3.ip = orig->src.u3.ip;
		break;
	case NFPROTO_IPV6:
		inverse->src.u3.in6 = orig->dst.u3.in6;
		inverse->dst.u3.in6 = orig->src.u3.in6;
		break;
	default:
		break;
	}

	inverse->dst.dir = !orig->dst.dir;

	inverse->dst.protonum = orig->dst.protonum;

	switch (orig->dst.protonum) {
	case IPPROTO_ICMP:
		return nf_conntrack_invert_icmp_tuple(inverse, orig);
#if IS_ENABLED(CONFIG_IPV6)
	case IPPROTO_ICMPV6:
		return nf_conntrack_invert_icmpv6_tuple(inverse, orig);
#endif
	}

	inverse->src.u.all = orig->dst.u.all;
	inverse->dst.u.all = orig->src.u.all;
	return true;
}
EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);

/* Generate a almost-unique pseudo-id for a given conntrack.
 *
 * intentionally doesn't re-use any of the seeds used for hash
 * table location, we assume id gets exposed to userspace.
 *
 * Following nf_conn items do not change throughout lifetime
 * of the nf_conn:
 *
 * 1. nf_conn address
 * 2. nf_conn->master address (normally NULL)
 * 3. the associated net namespace
 * 4.
the original direction tuple */
u32 nf_ct_get_id(const struct nf_conn *ct)
{
	/* private seed, filled in on first call */
	static siphash_aligned_key_t ct_id_seed;
	unsigned long a, b, c, d;

	net_get_random_once(&ct_id_seed, sizeof(ct_id_seed));

	a = (unsigned long)ct;
	b = (unsigned long)ct->master;
	c = (unsigned long)nf_ct_net(ct);
	d = (unsigned long)siphash(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				   sizeof(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple),
				   &ct_id_seed);
	/* fold the four values into one id with the same seed */
#ifdef CONFIG_64BIT
	return siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &ct_id_seed);
#else
	return siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &ct_id_seed);
#endif
}
EXPORT_SYMBOL_GPL(nf_ct_get_id);

/* Unlink both tuplehash nodes of @ct from their chains and drop all of its
 * expectations.  The only caller in this file holds both bucket locks.
 */
static void
clean_from_lists(struct nf_conn *ct)
{
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);

	/* Destroy all pending expectations */
	nf_ct_remove_expectations(ct);
}

/* Round @len up to the next multiple of (NFCT_INFOMASK + 1); this assumes
 * the mask has the form 2^k - 1.
 */
#define NFCT_ALIGN(len)	(((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK)

/* Allocate a zeroed conntrack template for @net and @zone.
 *
 * When kmalloc alignment may be smaller than NFCT_INFOMASK requires, the
 * object is over-allocated and the returned pointer is rounded up; the
 * distance to the real allocation is remembered in proto.tmpl_padto so that
 * nf_ct_tmpl_free() can undo it.
 *
 * Returns the template with IPS_TEMPLATE set and a reference count of 1,
 * or NULL if the allocation failed.
 *
 * Released via nf_ct_destroy()
 */
struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
				 const struct nf_conntrack_zone *zone,
				 gfp_t flags)
{
	struct nf_conn *tmpl, *p;

	if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) {
		tmpl = kzalloc(sizeof(*tmpl) + NFCT_INFOMASK, flags);
		if (!tmpl)
			return NULL;

		p = tmpl;
		tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
		if (tmpl != p) {
			/* NOTE(review): tmpl already holds this exact value
			 * from the assignment above; the re-computation
			 * looks redundant.
			 */
			tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
			tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p;
		}
	} else {
		tmpl = kzalloc(sizeof(*tmpl), flags);
		if (!tmpl)
			return NULL;
	}

	tmpl->status = IPS_TEMPLATE;
	write_pnet(&tmpl->ct_net, net);
	nf_ct_zone_add(tmpl, zone);
	refcount_set(&tmpl->ct_general.use, 1);

	return tmpl;
}
EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc);

/* Free a template obtained from nf_ct_tmpl_alloc(), including its
 * extension area.
 */
void nf_ct_tmpl_free(struct nf_conn *tmpl)
{
	kfree(tmpl->ext);

	/* step back to the start of the original allocation if the pointer
	 * was rounded up (tmpl_padto is 0 otherwise, memory being zeroed)
	 */
	if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK)
		kfree((char *)tmpl - tmpl->proto.tmpl_padto);
	else
		kfree(tmpl);
}
EXPORT_SYMBOL_GPL(nf_ct_tmpl_free);

/* Destroy the GRE keymap of @ct's master, if there is a master.  Compiles
 * to an empty function without CONFIG_NF_CT_PROTO_GRE.
 */
static void destroy_gre_conntrack(struct nf_conn *ct)
{
#ifdef CONFIG_NF_CT_PROTO_GRE
	struct nf_conn *master = ct->master;

	if (master)
		nf_ct_gre_keymap_destroy(master);
#endif
}

/* Final teardown of a conntrack object; warns if its reference count is
 * not zero.
 *
 * Templates are handed to nf_ct_tmpl_free().  For regular entries the GRE
 * keymap (if applicable) and remaining expectations are removed, the
 * reference on the master is dropped and the object is freed.
 */
void nf_ct_destroy(struct nf_conntrack *nfct)
{
	struct nf_conn *ct = (struct nf_conn *)nfct;

	WARN_ON(refcount_read(&nfct->use) != 0);

	if (unlikely(nf_ct_is_template(ct))) {
		nf_ct_tmpl_free(ct);
		return;
	}

	if (unlikely(nf_ct_protonum(ct) == IPPROTO_GRE))
		destroy_gre_conntrack(ct);

	/* Expectations will have been removed in clean_from_lists,
	 * except TFTP can create an expectation on the first packet,
	 * before connection is in the list, so we need to clean here,
	 * too.
	 */
	nf_ct_remove_expectations(ct);

	if (ct->master)
		nf_ct_put(ct->master);

	nf_conntrack_free(ct);
}
EXPORT_SYMBOL(nf_ct_destroy);

/* Unhash @ct under both of its bucket locks.
 *
 * The hashes are recomputed and the locks retaken for as long as
 * nf_conntrack_double_lock() reports that the generation changed.  Both
 * callers in this file invoke this with bottom halves disabled.
 */
static void __nf_ct_delete_from_lists(struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);
	unsigned int hash, reply_hash;
	unsigned int sequence;

	do {
		sequence = read_seqcount_begin(&nf_conntrack_generation);
		hash = hash_conntrack(net,
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				      nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL));
		reply_hash = hash_conntrack(net,
					   &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
					   nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

	clean_from_lists(ct);
	nf_conntrack_double_unlock(hash, reply_hash);
}

/* Run the helper destroy hook for @ct, then unhash it with bottom halves
 * disabled.
 */
static void nf_ct_delete_from_lists(struct nf_conn *ct)
{
	nf_ct_helper_destroy(ct);
	local_bh_disable();

	__nf_ct_delete_from_lists(ct);

	local_bh_enable();
}

/* Put @ct on the per-netns ecache dying list, re-using its ORIGINAL
 * direction hash node.  Does nothing without CONFIG_NF_CONNTRACK_EVENTS.
 */
static void nf_ct_add_to_ecache_list(struct nf_conn *ct)
{
#ifdef CONFIG_NF_CONNTRACK_EVENTS
	struct nf_conntrack_net *cnet = nf_ct_pernet(nf_ct_net(ct));

	spin_lock(&cnet->ecache.dying_lock);
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
				 &cnet->ecache.dying_list);
	spin_unlock(&cnet->ecache.dying_lock);
#endif
}

/* Mark @ct dying, report IPCT_DESTROY and remove it from the table.
 *
 * Returns false if the entry was already marked dying, or if the destroy
 * event could not be delivered (the entry is then unhashed, parked on the
 * ecache dying list and the ecache worker is kicked).  Returns true once
 * the entry was unhashed and the reference dropped here.
 */
bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
{
	struct nf_conn_tstamp *tstamp;
	struct net *net;

	if (test_and_set_bit(IPS_DYING_BIT, &ct->status))
		return false;

	tstamp = nf_conn_tstamp_find(ct);
	if (tstamp) {
		s32 timeout = READ_ONCE(ct->timeout) - nfct_time_stamp;

		tstamp->stop = ktime_get_real_ns();
		/* already past its timeout: back-date the stop time by the
		 * amount it is overdue
		 */
		if (timeout < 0)
			tstamp->stop -= jiffies_to_nsecs(-timeout);
	}

	if (nf_conntrack_event_report(IPCT_DESTROY, ct,
				    portid, report) < 0) {
		/* destroy event was not delivered. nf_ct_put will
		 * be done by event cache worker on redelivery.
		 */
		nf_ct_helper_destroy(ct);
		local_bh_disable();
		__nf_ct_delete_from_lists(ct);
		nf_ct_add_to_ecache_list(ct);
		local_bh_enable();

		nf_conntrack_ecache_work(nf_ct_net(ct), NFCT_ECACHE_DESTROY_FAIL);
		return false;
	}

	net = nf_ct_net(ct);
	if (nf_conntrack_ecache_dwork_pending(net))
		nf_conntrack_ecache_work(net, NFCT_ECACHE_DESTROY_SENT);
	nf_ct_delete_from_lists(ct);
	nf_ct_put(ct);
	return true;
}
EXPORT_SYMBOL_GPL(nf_ct_delete);

/* True if hash entry @h is a confirmed conntrack matching @tuple, @zone
 * (for the direction of @h) and @net.
 */
static inline bool
nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
		const struct nf_conntrack_tuple *tuple,
		const struct nf_conntrack_zone *zone,
		const struct net *net)
{
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);

	/* A conntrack can be recreated with the equal tuple,
	 * so we need to check that the conntrack is confirmed
	 */
	return nf_ct_tuple_equal(tuple, &h->tuple) &&
	       nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) &&
	       nf_ct_is_confirmed(ct) &&
	       net_eq(net, nf_ct_net(ct));
}

/* True if @ct1 and @ct2 have identical tuples and zones in both directions
 * and belong to the same network namespace.
 */
static inline bool
nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2)
{
	return nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				 &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
	       nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple,
				 &ct2->tuplehash[IP_CT_DIR_REPLY].tuple) &&
	       nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL) &&
	       nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_REPLY) &&
	       net_eq(nf_ct_net(ct1), nf_ct_net(ct2));
}

/* Kill @ct if it should be garbage collected.  A temporary reference is
 * taken first; nothing happens if the reference count is already zero.
 *
 * caller must hold rcu readlock and none of the nf_conntrack_locks
 */
static void nf_ct_gc_expired(struct nf_conn *ct)
{
	if (!refcount_inc_not_zero(&ct->ct_general.use))
		return;

	/* load ->status after refcount increase */
	smp_acquire__after_ctrl_dep();

	if (nf_ct_should_gc(ct))
		nf_ct_kill(ct);

	nf_ct_put(ct);
}

/*
 * Warning :
 * -
Caller must take a reference on returned object
 *   and recheck nf_ct_tuple_equal(tuple, &h->tuple)
 */
/* Lockless lookup of @tuple in the bucket selected by the raw @hash.
 *
 * Expired entries met on the way are passed to nf_ct_gc_expired() and
 * skipped.  Returns the first hash entry for which nf_ct_key_equal() holds,
 * or NULL when the chain was walked to its proper end without a match.
 * The walk uses the _rcu list iterator; callers in this file hold
 * rcu_read_lock().
 */
static struct nf_conntrack_tuple_hash *
____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone,
		      const struct nf_conntrack_tuple *tuple, u32 hash)
{
	struct hlist_nulls_head *table;
	struct nf_conntrack_tuple_hash *pos;
	struct hlist_nulls_node *node;
	unsigned int slot, nslots;

	for (;;) {
		/* table pointer and size are fetched afresh on every pass */
		nf_conntrack_get_ht(&table, &nslots);
		slot = reciprocal_scale(hash, nslots);

		hlist_nulls_for_each_entry_rcu(pos, node, &table[slot], hnnode) {
			struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(pos);

			if (nf_ct_is_expired(ct))
				nf_ct_gc_expired(ct);
			else if (nf_ct_key_equal(pos, tuple, zone, net))
				return pos;
		}

		/* The nulls marker that terminated the walk names the chain
		 * we ended up on.  If it is the chain we started in, the
		 * search is complete; otherwise we probably met an item that
		 * was moved to another chain and must start over.
		 */
		if (get_nulls_value(node) == slot)
			return NULL;

		NF_CT_STAT_INC_ATOMIC(net, search_restart);
	}
}

/* Find a connection corresponding to a tuple.
*/
/* Look up @tuple via ____nf_conntrack_find() and take a reference on the
 * result.
 *
 * The key is compared again after the reference was obtained; if the
 * reference count was already zero or the entry no longer matches, NULL is
 * returned (and the reference, if taken, dropped again).
 */
static struct nf_conntrack_tuple_hash *
__nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
			const struct nf_conntrack_tuple *tuple, u32 hash)
{
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;

	h = ____nf_conntrack_find(net, zone, tuple, hash);
	if (h) {
		/* We have a candidate that matches the tuple we're interested
		 * in, try to obtain a reference and re-check tuple
		 */
		ct = nf_ct_tuplehash_to_ctrack(h);
		if (likely(refcount_inc_not_zero(&ct->ct_general.use))) {
			/* re-check key after refcount */
			smp_acquire__after_ctrl_dep();

			if (likely(nf_ct_key_equal(h, tuple, zone, net)))
				return h;

			/* TYPESAFE_BY_RCU recycled the candidate */
			nf_ct_put(ct);
		}

		h = NULL;
	}

	return h;
}

/* Find the conntrack for @tuple in @net / @zone and return its hash entry
 * with a reference held, or NULL.
 *
 * The lookup is first done with the zone id of the ORIGINAL direction; if
 * that finds nothing and the REPLY direction uses a different zone id, it
 * is repeated with that id.  Runs under rcu_read_lock().
 */
struct nf_conntrack_tuple_hash *
nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
		      const struct nf_conntrack_tuple *tuple)
{
	unsigned int rid, zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL);
	struct nf_conntrack_tuple_hash *thash;

	rcu_read_lock();

	thash = __nf_conntrack_find_get(net, zone, tuple,
					hash_conntrack_raw(tuple, zone_id, net));

	if (thash)
		goto out_unlock;

	rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY);
	if (rid != zone_id)
		thash = __nf_conntrack_find_get(net, zone, tuple,
						hash_conntrack_raw(tuple, rid, net));

out_unlock:
	rcu_read_unlock();
	return thash;
}
EXPORT_SYMBOL_GPL(nf_conntrack_find_get);

/* Link both directions of @ct into nf_conntrack_hash at the given bucket
 * indices.  The callers in this file hold both bucket locks.
 */
static void __nf_conntrack_hash_insert(struct nf_conn *ct,
				       unsigned int hash,
				       unsigned int reply_hash)
{
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
			   &nf_conntrack_hash[hash]);
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
			   &nf_conntrack_hash[reply_hash]);
}

/* Check made before attempting an insertion: true if there is no extension
 * area or its gen_id equals the current nf_conntrack_ext_genid.
 */
static bool nf_ct_ext_valid_pre(const struct nf_ct_ext *ext)
{
	/* if ext->gen_id is not equal to nf_conntrack_ext_genid, some extensions
	 * may contain stale pointers to e.g. helper that has been removed.
	 *
	 * The helper can't clear this because the nf_conn object isn't in
	 * any hash and synchronize_rcu() isn't enough because associated skb
	 * might sit in a queue.
	 */
	return !ext || ext->gen_id == atomic_read(&nf_conntrack_ext_genid);
}

/* Re-check the extension generation at insertion time.
 *
 * Returns false if the generation changed.  Otherwise gen_id is reset to 0
 * and true is returned (also when there is no extension area at all).
 */
static bool nf_ct_ext_valid_post(struct nf_ct_ext *ext)
{
	if (!ext)
		return true;

	if (ext->gen_id != atomic_read(&nf_conntrack_ext_genid))
		return false;

	/* inserted into conntrack table, nf_ct_iterate_cleanup()
	 * will find it. Disable nf_ct_ext_find() id check.
	 */
	WRITE_ONCE(ext->gen_id, 0);

	return true;
}

/* Insert @ct into the hash table unless an equal entry already exists.
 *
 * Returns 0 on success (the reference count is then set to 2),
 * -EEXIST if either direction is already present,
 * -ENOSPC if a bucket chain is longer than the randomised limit, and
 * -EAGAIN if the extension area is stale.
 */
int
nf_conntrack_hash_check_insert(struct nf_conn *ct)
{
	const struct nf_conntrack_zone *zone;
	struct net *net = nf_ct_net(ct);
	unsigned int hash, reply_hash;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	unsigned int max_chainlen;
	unsigned int chainlen = 0;
	unsigned int sequence;
	int err = -EEXIST;

	zone = nf_ct_zone(ct);

	if (!nf_ct_ext_valid_pre(ct->ext))
		return -EAGAIN;

	local_bh_disable();
	/* hash and lock; start over if the generation changed meanwhile */
	do {
		sequence = read_seqcount_begin(&nf_conntrack_generation);
		hash = hash_conntrack(net,
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				      nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL));
		reply_hash = hash_conntrack(net,
					   &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
					   nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

	max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN);

	/* See if there's one in the list already, including reverse */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				    zone, net))
			goto out;

		if (chainlen++ > max_chainlen)
			goto chaintoolong;
	}

	chainlen = 0;

	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			goto out;
		if (chainlen++ > max_chainlen)
			goto chaintoolong;
	}

	/* If genid has changed, we can't insert anymore because ct
	 * extensions could have stale pointers and nf_ct_iterate_destroy
	 * might have completed its table scan already.
	 *
	 * Increment of the ext genid right after this check is fine:
	 * nf_ct_iterate_destroy blocks until locks are released.
	 */
	if (!nf_ct_ext_valid_post(ct->ext)) {
		err = -EAGAIN;
		goto out;
	}

	smp_wmb();
	/* The caller holds a reference to this object */
	refcount_set(&ct->ct_general.use, 2);
	__nf_conntrack_hash_insert(ct, hash, reply_hash);
	nf_conntrack_double_unlock(hash, reply_hash);
	NF_CT_STAT_INC(net, insert);
	local_bh_enable();

	return 0;
chaintoolong:
	NF_CT_STAT_INC(net, chaintoolong);
	err = -ENOSPC;
out:
	nf_conntrack_double_unlock(hash, reply_hash);
	local_bh_enable();
	return err;
}
EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);

/* Add @packets and @bytes to the accounting counters of @ct for direction
 * @dir.  Does nothing if @ct has no accounting extension.
 */
void nf_ct_acct_add(struct nf_conn *ct, u32 dir, unsigned int packets,
		    unsigned int bytes)
{
	struct nf_conn_acct *acct;

	acct = nf_conn_acct_find(ct);
	if (acct) {
		struct nf_conn_counter *counter = acct->counter;

		atomic64_add(packets, &counter[dir].packets);
		atomic64_add(bytes, &counter[dir].bytes);
	}
}
EXPORT_SYMBOL_GPL(nf_ct_acct_add);

/* Credit the byte count @loser_ct recorded for the direction of @ctinfo to
 * @ct through nf_ct_acct_update() (defined outside this part of the file).
 * Does nothing if @loser_ct has no accounting extension.
 */
static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
			     const struct nf_conn *loser_ct)
{
	struct nf_conn_acct *acct;

	acct = nf_conn_acct_find(loser_ct);
	if (acct) {
		struct nf_conn_counter *counter = acct->counter;
		unsigned int bytes;

		/* u32 should be fine since we must have seen one packet. */
		bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes);
		nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), bytes);
	}
}

/* Common preparation right before hashing @ct: take one more reference and
 * record the start time in the timestamp extension, if present.
 */
static void __nf_conntrack_insert_prepare(struct nf_conn *ct)
{
	struct nf_conn_tstamp *tstamp;

	refcount_inc(&ct->ct_general.use);

	/* set conntrack timestamp, if enabled. */
	tstamp = nf_conn_tstamp_find(ct);
	if (tstamp)
		tstamp->start = ktime_get_real_ns();
}

/* Try to attach @skb to the conntrack that is already in the table.
 *
 * This succeeds (NF_ACCEPT) if the existing entry is not dying and either
 * has none of the IPS_NAT_DONE_MASK bits set or matches the skb's own
 * conntrack exactly.  The accounting of the losing conntrack is merged,
 * its reference dropped and @skb re-pointed at the existing entry.
 * Returns NF_DROP otherwise.
 *
 * caller must hold locks to prevent concurrent changes
 */
static int __nf_ct_resolve_clash(struct sk_buff *skb,
				 struct nf_conntrack_tuple_hash *h)
{
	/* This is the conntrack entry already in hashes that won race. */
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
	enum ip_conntrack_info ctinfo;
	struct nf_conn *loser_ct;

	loser_ct = nf_ct_get(skb, &ctinfo);

	if (nf_ct_is_dying(ct))
		return NF_DROP;

	if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
	    nf_ct_match(ct, loser_ct)) {
		struct net *net = nf_ct_net(ct);

		/* reference for the skb, taken before the loser is released */
		nf_conntrack_get(&ct->ct_general);

		nf_ct_acct_merge(ct, ctinfo, loser_ct);
		nf_ct_put(loser_ct);
		nf_ct_set(skb, ct, ctinfo);

		NF_CT_STAT_INC(net, clash_resolve);
		return NF_ACCEPT;
	}

	return NF_DROP;
}

/**
 * nf_ct_resolve_clash_harder - attempt to insert clashing conntrack entry
 *
 * @skb: skb that causes the collision
 * @repl_idx: hash slot for reply direction
 *
 * Called when origin or reply direction had a clash.
 * The skb can be handled without packet drop provided the reply direction
 * is unique or the existing entry has the identical tuple in both
 * directions.
 *
 * Caller must hold conntrack table locks to prevent concurrent updates.
 *
 * Returns NF_DROP if the clash could not be handled.
 */
static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx)
{
	struct nf_conn *loser_ct = (struct nf_conn *)skb_nfct(skb);
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	struct net *net;

	zone = nf_ct_zone(loser_ct);
	net = nf_ct_net(loser_ct);

	/* Reply direction must never result in a clash, unless both origin
	 * and reply tuples are identical.
	 */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[repl_idx], hnnode) {
		if (nf_ct_key_equal(h,
				    &loser_ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			return __nf_ct_resolve_clash(skb, h);
	}

	/* We want the clashing entry to go away real soon: 1 second timeout. */
	WRITE_ONCE(loser_ct->timeout, nfct_time_stamp + HZ);

	/* IPS_NAT_CLASH removes the entry automatically on the first
	 * reply.  Also prevents UDP tracker from moving the entry to
	 * ASSURED state, i.e. the entry can always be evicted under
	 * pressure.
	 */
	loser_ct->status |= IPS_FIXED_TIMEOUT | IPS_NAT_CLASH;

	__nf_conntrack_insert_prepare(loser_ct);

	/* fake add for ORIGINAL dir: we want lookups to only find the entry
	 * already in the table.  This also hides the clashing entry from
	 * ctnetlink iteration, i.e. conntrack -L won't show them.
	 */
	hlist_nulls_add_fake(&loser_ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);

	hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
				 &nf_conntrack_hash[repl_idx]);

	NF_CT_STAT_INC(net, clash_resolve);
	return NF_ACCEPT;
}

/**
 * nf_ct_resolve_clash - attempt to handle clash without packet drop
 *
 * @skb: skb that causes the clash
 * @h: tuplehash of the clashing entry already in table
 * @reply_hash: hash slot for reply direction
 *
 * A conntrack entry can be inserted to the connection tracking table
 * if there is no existing entry with an identical tuple.
 *
 * If there is one, @skb (and the assocated, unconfirmed conntrack) has
 * to be dropped.  In case @skb is retransmitted, next conntrack lookup
 * will find the already-existing entry.
 *
 * The major problem with such packet drop is the extra delay added by
 * the packet loss -- it will take some time for a retransmit to occur
 * (or the sender to time out when waiting for a reply).
 *
 * This function attempts to handle the situation without packet drop.
 *
 * If @skb has no NAT transformation or if the colliding entries are
 * exactly the same, only the to-be-confirmed conntrack entry is discarded
 * and @skb is associated with the conntrack entry already in the table.
 *
 * Failing that, the new, unconfirmed conntrack is still added to the table
 * provided that the collision only occurs in the ORIGINAL direction.
 * The new entry will be added only in the non-clashing REPLY direction,
 * so packets in the ORIGINAL direction will continue to match the existing
 * entry.  The new entry will also have a fixed timeout so it expires --
 * due to the collision, it will only see reply traffic.
*
 * Returns NF_DROP if the clash could not be resolved.
 */
static __cold noinline int
nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h,
		    u32 reply_hash)
{
	/* This is the conntrack entry already in hashes that won race. */
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
	const struct nf_conntrack_l4proto *l4proto;
	enum ip_conntrack_info ctinfo;
	struct nf_conn *loser_ct;
	struct net *net;
	int ret;

	loser_ct = nf_ct_get(skb, &ctinfo);
	net = nf_ct_net(loser_ct);

	/* clash resolution is only attempted for protocols that allow it */
	l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
	if (!l4proto->allow_clash)
		goto drop;

	/* first choice: re-use the entry that is already in the table */
	ret = __nf_ct_resolve_clash(skb, h);
	if (ret == NF_ACCEPT)
		return ret;

	/* second choice: insert the new entry in REPLY direction only */
	ret = nf_ct_resolve_clash_harder(skb, reply_hash);
	if (ret == NF_ACCEPT)
		return ret;

drop:
	NF_CT_STAT_INC(net, drop);
	NF_CT_STAT_INC(net, insert_failed);
	return NF_DROP;
}

/* Confirm a connection given skb; places it in hash table
 *
 * Returns NF_ACCEPT if the conntrack attached to @skb was inserted, if a
 * clash with an existing entry could be resolved, or if the packet is not
 * in the ORIGINAL direction (nothing is done then).  Returns NF_DROP if
 * the conntrack is already confirmed, dying, has a stale extension area,
 * a bucket chain is too long, or the clash could not be resolved.
 */
int
__nf_conntrack_confirm(struct sk_buff *skb)
{
	unsigned int chainlen = 0, sequence, max_chainlen;
	const struct nf_conntrack_zone *zone;
	unsigned int hash, reply_hash;
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;
	struct nf_conn_help *help;
	struct hlist_nulls_node *n;
	enum ip_conntrack_info ctinfo;
	struct net *net;
	int ret = NF_DROP;

	ct = nf_ct_get(skb, &ctinfo);
	net = nf_ct_net(ct);

	/* ipt_REJECT uses nf_conntrack_attach to attach related
	   ICMP/TCP RST packets in other direction.  Actual packet
	   which created connection will be IP_CT_NEW or for an
	   expected connection, IP_CT_RELATED. */
	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
		return NF_ACCEPT;

	zone = nf_ct_zone(ct);
	local_bh_disable();

	do {
		sequence = read_seqcount_begin(&nf_conntrack_generation);
		/* reuse the hash saved before */
		/* NOTE(review): the raw hash is read back from the REPLY
		 * node's pprev slot; the code that stores it there is not
		 * in this part of the file.
		 */
		hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
		hash = scale_hash(hash);
		reply_hash = hash_conntrack(net,
					   &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
					   nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

	/* We're not in hash table, and we refuse to set up related
	 * connections for unconfirmed conns.  But packet copies and
	 * REJECT will give spurious warnings here.
	 */

	/* Another skb with the same unconfirmed conntrack may
	 * win the race. This may happen for bridge(br_flood)
	 * or broadcast/multicast packets do skb_clone with
	 * unconfirmed conntrack.
	 */
	if (unlikely(nf_ct_is_confirmed(ct))) {
		WARN_ON_ONCE(1);
		nf_conntrack_double_unlock(hash, reply_hash);
		local_bh_enable();
		return NF_DROP;
	}

	if (!nf_ct_ext_valid_pre(ct->ext)) {
		NF_CT_STAT_INC(net, insert_failed);
		goto dying;
	}

	/* We have to check the DYING flag after unlink to prevent
	 * a race against nf_ct_get_next_corpse() possibly called from
	 * user context, else we insert an already 'dead' hash, blocking
	 * further use of that particular connection -JM.
	 */
	ct->status |= IPS_CONFIRMED;

	if (unlikely(nf_ct_is_dying(ct))) {
		NF_CT_STAT_INC(net, insert_failed);
		goto dying;
	}

	max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN);
	/* See if there's one in the list already, including reverse:
	   NAT could have grabbed it without realizing, since we're
	   not in the hash.  If there is, we lost race. */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				    zone, net))
			goto out;
		if (chainlen++ > max_chainlen)
			goto chaintoolong;
	}

	chainlen = 0;
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			goto out;
		if (chainlen++ > max_chainlen) {
			/* also the target of the goto in the ORIGINAL-chain
			 * walk above
			 */
chaintoolong:
			NF_CT_STAT_INC(net, chaintoolong);
			NF_CT_STAT_INC(net, insert_failed);
			ret = NF_DROP;
			goto dying;
		}
	}

	/* Timer relative to confirmation time, not original
	   setting time, otherwise we'd get timer wrap in
	   weird delay cases. */
	ct->timeout += nfct_time_stamp;

	__nf_conntrack_insert_prepare(ct);

	/* Since the lookup is lockless, hash insertion must be done after
	 * starting the timer and setting the CONFIRMED bit. The RCU barriers
	 * guarantee that no other CPU can find the conntrack before the above
	 * stores are visible.
	 */
	__nf_conntrack_hash_insert(ct, hash, reply_hash);
	nf_conntrack_double_unlock(hash, reply_hash);
	local_bh_enable();

	/* ext area is still valid (rcu read lock is held,
	 * but will go out of scope soon, we need to remove
	 * this conntrack again.
	 */
	if (!nf_ct_ext_valid_post(ct->ext)) {
		nf_ct_kill(ct);
		NF_CT_STAT_INC_ATOMIC(net, drop);
		return NF_DROP;
	}

	help = nfct_help(ct);
	if (help && help->helper)
		nf_conntrack_event_cache(IPCT_HELPER, ct);

	nf_conntrack_event_cache(master_ct(ct) ?
				 IPCT_RELATED : IPCT_NEW, ct);
	return NF_ACCEPT;

out:
	/* an equal entry is already hashed: try to resolve the clash */
	ret = nf_ct_resolve_clash(skb, h, reply_hash);
dying:
	nf_conntrack_double_unlock(hash, reply_hash);
	local_bh_enable();
	return ret;
}
EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);

/* Returns true if a connection corresponds to the tuple (required
   for NAT).
*/
/* Returns 1 when a conntrack other than @ignored_conntrack matches @tuple in
 * the zone and netns of @ignored_conntrack, 0 otherwise.  Entries whose
 * original tuple and original-direction zone equal those of
 * @ignored_conntrack are skipped rather than reported.
 */
int
nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
			 const struct nf_conn *ignored_conntrack)
{
	struct net *net = nf_ct_net(ignored_conntrack);
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_head *ct_hash;
	unsigned int hash, hsize;
	struct hlist_nulls_node *n;
	struct nf_conn *ct;

	zone = nf_ct_zone(ignored_conntrack);

	rcu_read_lock();
 begin:
	/* Table pointer and size are re-read on every (re)start. */
	nf_conntrack_get_ht(&ct_hash, &hsize);
	hash = __hash_conntrack(net, tuple, nf_ct_zone_id(zone, IP_CT_DIR_REPLY), hsize);

	hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) {
		ct = nf_ct_tuplehash_to_ctrack(h);

		if (ct == ignored_conntrack)
			continue;

		/* Expired entries are handed to nf_ct_gc_expired() and skipped. */
		if (nf_ct_is_expired(ct)) {
			nf_ct_gc_expired(ct);
			continue;
		}

		if (nf_ct_key_equal(h, tuple, zone, net)) {
			/* Tuple is taken already, so caller will need to find
			 * a new source port to use.
			 *
			 * Only exception:
			 * If the *original tuples* are identical, then both
			 * conntracks refer to the same flow.
			 * This is a rare situation, it can occur e.g. when
			 * more than one UDP packet is sent from same socket
			 * in different threads.
			 *
			 * Let nf_ct_resolve_clash() deal with this later.
			 */
			if (nf_ct_tuple_equal(&ignored_conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
					      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
					      nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL))
				continue;

			NF_CT_STAT_INC_ATOMIC(net, found);
			rcu_read_unlock();
			return 1;
		}
	}

	/* The walk ended on a nulls value that is not this bucket's index:
	 * count a search restart and walk again.
	 */
	if (get_nulls_value(n) != hash) {
		NF_CT_STAT_INC_ATOMIC(net, search_restart);
		goto begin;
	}

	rcu_read_unlock();

	return 0;
}
EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);

/* Number of consecutive buckets early_drop() scans for eviction candidates. */
#define NF_CT_EVICTION_RANGE	8

/* There's a small race here where we may free a just-assured
   connection.  Too bad: we're in trouble anyway.
*/
/* Walk one hash chain and delete entries of @net that are not assured and
 * not dying.  Returns the number of entries for which nf_ct_delete()
 * succeeded.
 */
static unsigned int early_drop_list(struct net *net,
				    struct hlist_nulls_head *head)
{
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	unsigned int drops = 0;
	struct nf_conn *tmp;

	hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
		tmp = nf_ct_tuplehash_to_ctrack(h);

		if (nf_ct_is_expired(tmp)) {
			nf_ct_gc_expired(tmp);
			continue;
		}

		/* Unlocked pre-check; repeated below once a reference is held. */
		if (test_bit(IPS_ASSURED_BIT, &tmp->status) ||
		    !net_eq(nf_ct_net(tmp), net) ||
		    nf_ct_is_dying(tmp))
			continue;

		if (!refcount_inc_not_zero(&tmp->ct_general.use))
			continue;

		/* load ->ct_net and ->status after refcount increase */
		smp_acquire__after_ctrl_dep();

		/* kill only if still in same netns -- might have moved due to
		 * SLAB_TYPESAFE_BY_RCU rules.
		 *
		 * We steal the timer reference.  If that fails timer has
		 * already fired or someone else deleted it. Just drop ref
		 * and move to next entry.
		 */
		if (net_eq(nf_ct_net(tmp), net) &&
		    nf_ct_is_confirmed(tmp) &&
		    nf_ct_delete(tmp, 0, 0))
			drops++;

		nf_ct_put(tmp);
	}

	return drops;
}

/* Try to evict entries of @net from up to NF_CT_EVICTION_RANGE consecutive
 * buckets, starting at the bucket derived from @hash.  Stops at the first
 * bucket where something was dropped.
 *
 * NOTE(review): declared as returning int but only ever returns true/false.
 */
static noinline int early_drop(struct net *net, unsigned int hash)
{
	unsigned int i, bucket;

	for (i = 0; i < NF_CT_EVICTION_RANGE; i++) {
		struct hlist_nulls_head *ct_hash;
		unsigned int hsize, drops;

		rcu_read_lock();
		nf_conntrack_get_ht(&ct_hash, &hsize);
		/* First pass scales the hash; later passes step to the next
		 * bucket, wrapping at the table size read in this pass.
		 */
		if (!i)
			bucket = reciprocal_scale(hash, hsize);
		else
			bucket = (bucket + 1) % hsize;

		drops = early_drop_list(net, &ct_hash[bucket]);
		rcu_read_unlock();

		if (drops) {
			NF_CT_STAT_ADD_ATOMIC(net, early_drop, drops);
			return true;
		}
	}

	return false;
}

/* True for entries the gc early-drop pass must leave alone: not yet
 * confirmed, or already dying.
 */
static bool gc_worker_skip_ct(const struct nf_conn *ct)
{
	return !nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct);
}

/* Decide whether gc_worker() may kill @ct early.  Offloaded entries are only
 * eligible for UDP; unassured entries are always eligible; assured entries
 * are eligible only if the l4 protocol's ->can_early_drop() agrees.
 */
static bool gc_worker_can_early_drop(const struct nf_conn *ct)
{
	const struct nf_conntrack_l4proto *l4proto;
	u8 protonum = nf_ct_protonum(ct);

	if (test_bit(IPS_OFFLOAD_BIT, &ct->status) && protonum != IPPROTO_UDP)
		return false;
	if (!test_bit(IPS_ASSURED_BIT, &ct->status))
		return true;

	l4proto = nf_ct_l4proto_find(protonum);
	if (l4proto->can_early_drop && l4proto->can_early_drop(ct))
		return true;

	return false;
}

/* Delayed-work handler that scans the hash table bucket by bucket, reaps
 * expired entries and, when early drop was requested, kills eligible entries
 * of namespaces at or above 95% of nf_conntrack_max.  The scan is resumable:
 * position and running statistics are kept in struct conntrack_gc_work, and
 * the work re-queues itself unless ->exiting is set.
 */
static void gc_worker(struct work_struct *work)
{
	unsigned int i, hashsz, nf_conntrack_max95 = 0;
	u32 end_time, start_time = nfct_time_stamp;
	struct conntrack_gc_work *gc_work;
	unsigned int expired_count = 0;
	unsigned long next_run;
	s32 delta_time;
	long count;

	gc_work = container_of(work, struct conntrack_gc_work, dwork.work);

	i = gc_work->next_bucket;
	/* Stays 0 (early-drop pass disabled) unless ->early_drop is set. */
	if (gc_work->early_drop)
		nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;

	/* Bucket 0 means a new scan cycle starts: reset the running state. */
	if (i == 0) {
		gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT;
		gc_work->count = GC_SCAN_INITIAL_COUNT;
		gc_work->start_time = start_time;
	}

	next_run = gc_work->avg_timeout;
	count = gc_work->count;

	end_time = start_time + GC_SCAN_MAX_DURATION;

	do {
		struct nf_conntrack_tuple_hash *h;
		struct hlist_nulls_head *ct_hash;
		struct hlist_nulls_node *n;
		struct nf_conn *tmp;

		rcu_read_lock();

		/* Table pointer and size are re-read for every bucket. */
		nf_conntrack_get_ht(&ct_hash, &hashsz);
		if (i >= hashsz) {
			rcu_read_unlock();
			break;
		}

		hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
			struct nf_conntrack_net *cnet;
			struct net *net;
			long expires;

			tmp = nf_ct_tuplehash_to_ctrack(h);

			if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
				nf_ct_offload_timeout(tmp);
				if (!nf_conntrack_max95)
					continue;
			}

			/* Too many evictions in this run: save position and
			 * statistics, then leave.
			 */
			if (expired_count > GC_SCAN_EXPIRED_MAX) {
				rcu_read_unlock();

				gc_work->next_bucket = i;
				gc_work->avg_timeout = next_run;
				gc_work->count = count;

				delta_time = nfct_time_stamp - gc_work->start_time;

				/* re-sched immediately if total cycle time is exceeded */
				next_run = delta_time < (s32)GC_SCAN_INTERVAL_MAX;
				goto early_exit;
			}

			if (nf_ct_is_expired(tmp)) {
				nf_ct_gc_expired(tmp);
				expired_count++;
				continue;
			}

			/* Fold this entry's clamped remaining lifetime into
			 * the running average kept in next_run.
			 */
			expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP);
			expires = (expires - (long)next_run) / ++count;
			next_run += expires;

			if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
				continue;

			net = nf_ct_net(tmp);
			cnet = nf_ct_pernet(net);
			if (atomic_read(&cnet->count) < nf_conntrack_max95)
				continue;

			/* need to take reference to avoid possible races */
			if (!refcount_inc_not_zero(&tmp->ct_general.use))
				continue;

			/* load ->status after refcount increase */
			smp_acquire__after_ctrl_dep();

			/* Re-check now that a reference is held. */
			if (gc_worker_skip_ct(tmp)) {
				nf_ct_put(tmp);
				continue;
			}

			if (gc_worker_can_early_drop(tmp)) {
				nf_ct_kill(tmp);
				expired_count++;
			}

			nf_ct_put(tmp);
		}

		/* could check get_nulls_value() here and restart if ct
		 * was moved to another chain.  But given gc is best-effort
		 * we will just continue with next hash slot.
		 */
		rcu_read_unlock();
		cond_resched();
		i++;

		/* Time budget used up with buckets left: save state and
		 * re-queue with no delay.
		 */
		delta_time = nfct_time_stamp - end_time;
		if (delta_time > 0 && i < hashsz) {
			gc_work->avg_timeout = next_run;
			gc_work->count = count;
			gc_work->next_bucket = i;
			next_run = 0;
			goto early_exit;
		}
	} while (i < hashsz);

	/* Full cycle done: the next run starts at bucket 0 again. */
	gc_work->next_bucket = 0;

	next_run = clamp(next_run, GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_MAX);

	/* Subtract the time this cycle took, but wait at least one tick. */
	delta_time = max_t(s32, nfct_time_stamp - gc_work->start_time, 1);
	if (next_run > (unsigned long)delta_time)
		next_run -= delta_time;
	else
		next_run = 1;

early_exit:
	if (gc_work->exiting)
		return;

	/* The early-drop request is cleared whenever a non-zero delay is used. */
	if (next_run)
		gc_work->early_drop = false;

	queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run);
}

/* Bind gc_worker() to the delayed work and clear the exiting flag. */
static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work)
{
	INIT_DELAYED_WORK(&gc_work->dwork, gc_worker);
	gc_work->exiting = false;
}

/* Allocate and minimally initialise a conntrack for @net.
 *
 * The per-netns counter is incremented first; when it exceeds a non-zero
 * nf_conntrack_max and early_drop() frees nothing, the gc early-drop flag is
 * raised and ERR_PTR(-ENOMEM) is returned.  @hash is stashed in the reply
 * hnnode.pprev slot for later reuse.  The object is returned with a refcount
 * of zero.  Returns ERR_PTR(-ENOMEM) on any failure.
 */
static struct nf_conn *
__nf_conntrack_alloc(struct net *net,
		     const struct nf_conntrack_zone *zone,
		     const struct nf_conntrack_tuple *orig,
		     const struct nf_conntrack_tuple *repl,
		     gfp_t gfp, u32 hash)
{
	struct nf_conntrack_net *cnet = nf_ct_pernet(net);
	unsigned int ct_count;
	struct nf_conn *ct;

	/* We don't want any race condition at early drop stage */
	ct_count = atomic_inc_return(&cnet->count);

	if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
		if (!early_drop(net, hash)) {
			if (!conntrack_gc_work.early_drop)
				conntrack_gc_work.early_drop = true;
			atomic_dec(&cnet->count);
			net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
			return ERR_PTR(-ENOMEM);
		}
	}

	/*
	 * Do not use kmem_cache_zalloc(), as this cache uses
	 * SLAB_TYPESAFE_BY_RCU.
	 */
	ct = kmem_cache_alloc(nf_conntrack_cachep, gfp);
	if (ct == NULL)
		goto out;

	spin_lock_init(&ct->lock);
	ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
	ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
	ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
	/* save hash for reusing when confirming */
	*(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
	ct->status = 0;
	WRITE_ONCE(ct->timeout, 0);
	write_pnet(&ct->ct_net, net);
	/* Zero every member located after __nfct_init_offset. */
	memset_after(ct, 0, __nfct_init_offset);

	nf_ct_zone_add(ct, zone);

	/* Because we use RCU lookups, we set ct_general.use to zero before
	 * this is inserted in any list.
	 */
	refcount_set(&ct->ct_general.use, 0);
	return ct;
out:
	atomic_dec(&cnet->count);
	return ERR_PTR(-ENOMEM);
}

/* Exported wrapper around __nf_conntrack_alloc() that passes 0 as the hash. */
struct nf_conn *nf_conntrack_alloc(struct net *net,
				   const struct nf_conntrack_zone *zone,
				   const struct nf_conntrack_tuple *orig,
				   const struct nf_conntrack_tuple *repl,
				   gfp_t gfp)
{
	return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0);
}
EXPORT_SYMBOL_GPL(nf_conntrack_alloc);

/* Release a conntrack whose refcount is already zero: remove it from the NAT
 * bysrc list if IPS_SRC_NAT_DONE is set, free the extension area and the
 * object, then decrement the per-netns counter.
 */
void nf_conntrack_free(struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);
	struct nf_conntrack_net *cnet;

	/* A freed object has refcnt == 0, that's
	 * the golden rule for SLAB_TYPESAFE_BY_RCU
	 */
	WARN_ON(refcount_read(&ct->ct_general.use) != 0);

	if (ct->status & IPS_SRC_NAT_DONE) {
		const struct nf_nat_hook *nat_hook;

		rcu_read_lock();
		nat_hook = rcu_dereference(nf_nat_hook);
		if (nat_hook)
			nat_hook->remove_nat_bysrc(ct);
		rcu_read_unlock();
	}

	kfree(ct->ext);
	kmem_cache_free(nf_conntrack_cachep, ct);
	/* 'net' was read before the free above; 'ct' is not touched again. */
	cnet = nf_ct_pernet(net);

	smp_mb__before_atomic();
	atomic_dec(&cnet->count);
}
EXPORT_SYMBOL_GPL(nf_conntrack_free);


/* Allocate a new conntrack: we return -ENOMEM if classification
   failed due to stress.  Otherwise it really is unclassifiable.
*/
/* Returns the original-direction tuplehash of the new conntrack, NULL when
 * the tuple cannot be inverted, or an ERR_PTR() on allocation/extension
 * failure.  The new conntrack is returned with a refcount of 1.
 * NOTE(review): @dataoff is not referenced in this function body.
 */
static noinline struct nf_conntrack_tuple_hash *
init_conntrack(struct net *net, struct nf_conn *tmpl,
	       const struct nf_conntrack_tuple *tuple,
	       struct sk_buff *skb,
	       unsigned int dataoff, u32 hash)
{
	struct nf_conn *ct;
	struct nf_conn_help *help;
	struct nf_conntrack_tuple repl_tuple;
#ifdef CONFIG_NF_CONNTRACK_EVENTS
	struct nf_conntrack_ecache *ecache;
#endif
	struct nf_conntrack_expect *exp = NULL;
	const struct nf_conntrack_zone *zone;
	struct nf_conn_timeout *timeout_ext;
	struct nf_conntrack_zone tmp;
	struct nf_conntrack_net *cnet;

	if (!nf_ct_invert_tuple(&repl_tuple, tuple))
		return NULL;

	zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
	ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
				  hash);
	/* Hand the ERR_PTR() through unchanged, only retyped. */
	if (IS_ERR(ct))
		return (struct nf_conntrack_tuple_hash *)ct;

	if (!nf_ct_add_synproxy(ct, tmpl)) {
		nf_conntrack_free(ct);
		return ERR_PTR(-ENOMEM);
	}

	/* A timeout policy is copied from the template, if it has one. */
	timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;

	if (timeout_ext)
		nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout),
				      GFP_ATOMIC);

	/* The return values of these three extension adds are not checked. */
	nf_ct_acct_ext_add(ct, GFP_ATOMIC);
	nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
	nf_ct_labels_ext_add(ct);

#ifdef CONFIG_NF_CONNTRACK_EVENTS
	ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;

	/* The event cache extension is mandatory when the template carries
	 * one or the sysctl enables events: failing to add it is an error.
	 */
	if ((ecache || net->ct.sysctl_events) &&
	    !nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
				  ecache ? ecache->expmask : 0,
				  GFP_ATOMIC)) {
		nf_conntrack_free(ct);
		return ERR_PTR(-ENOMEM);
	}
#endif

	cnet = nf_ct_pernet(net);
	/* The expectation lock is only taken when the netns has expectations. */
	if (cnet->expect_count) {
		spin_lock_bh(&nf_conntrack_expect_lock);
		exp = nf_ct_find_expectation(net, zone, tuple, !tmpl || nf_ct_is_confirmed(tmpl));
		if (exp) {
			/* Welcome, Mr. Bond.  We've been expecting you... */
			__set_bit(IPS_EXPECTED_BIT, &ct->status);
			/* exp->master safe, refcnt bumped in nf_ct_find_expectation */
			ct->master = exp->master;
			if (exp->helper) {
				help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
				if (help)
					rcu_assign_pointer(help->helper, exp->helper);
			}

#ifdef CONFIG_NF_CONNTRACK_MARK
			ct->mark = READ_ONCE(exp->master->mark);
#endif
#ifdef CONFIG_NF_CONNTRACK_SECMARK
			ct->secmark = exp->master->secmark;
#endif
			NF_CT_STAT_INC(net, expect_new);
		}
		spin_unlock_bh(&nf_conntrack_expect_lock);
	}
	/* No expectation matched: try to assign a helper from the template. */
	if (!exp && tmpl)
		__nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);

	/* Other CPU might have obtained a pointer to this object before it was
	 * released.  Because refcount is 0, refcount_inc_not_zero() will fail.
	 *
	 * After refcount_set(1) it will succeed; ensure that zeroing of
	 * ct->status and the correct ct->net pointer are visible; else other
	 * core might observe CONFIRMED bit which means the entry is valid and
	 * in the hash table, but it's not (anymore).
	 */
	smp_wmb();

	/* Now it is going to be associated with an sk_buff, set refcount to 1. */
	refcount_set(&ct->ct_general.use, 1);

	/* The expectation callback runs after the refcount is set, then the
	 * expectation reference is put.
	 */
	if (exp) {
		if (exp->expectfn)
			exp->expectfn(ct, exp);
		nf_ct_expect_put(exp);
	}

	return &ct->tuplehash[IP_CT_DIR_ORIGINAL];
}

/* On success, returns 0, sets skb->_nfct | ctinfo.
 *
 * Also returns 0 without attaching anything when no tuple can be extracted
 * or init_conntrack() returns NULL.  A negative errno from init_conntrack()
 * is passed through.
 */
static int
resolve_normal_ct(struct nf_conn *tmpl,
		  struct sk_buff *skb,
		  unsigned int dataoff,
		  u_int8_t protonum,
		  const struct nf_hook_state *state)
{
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple tuple;
	struct nf_conntrack_tuple_hash *h;
	enum ip_conntrack_info ctinfo;
	struct nf_conntrack_zone tmp;
	u32 hash, zone_id, rid;
	struct nf_conn *ct;

	if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
			     dataoff, state->pf, protonum, state->net,
			     &tuple))
		return 0;

	/* look for tuple match */
	zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);

	zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL);
	hash = hash_conntrack_raw(&tuple, zone_id, state->net);
	h = __nf_conntrack_find_get(state->net, zone, &tuple, hash);

	/* Second lookup with the reply-direction zone id, but only when it
	 * differs from the original-direction id.
	 */
	if (!h) {
		rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY);
		if (zone_id != rid) {
			u32 tmp = hash_conntrack_raw(&tuple, rid, state->net);

			h = __nf_conntrack_find_get(state->net, zone, &tuple, tmp);
		}
	}

	/* Still nothing: create a new conntrack, passing the
	 * original-direction hash computed above.
	 */
	if (!h) {
		h = init_conntrack(state->net, tmpl, &tuple,
				   skb, dataoff, hash);
		if (!h)
			return 0;
		if (IS_ERR(h))
			return PTR_ERR(h);
	}
	ct = nf_ct_tuplehash_to_ctrack(h);

	/* It exists; we have (non-exclusive) reference. */
	if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
		ctinfo = IP_CT_ESTABLISHED_REPLY;
	} else {
		unsigned long status = READ_ONCE(ct->status);

		/* Once we've had two way comms, always ESTABLISHED. */
		if (likely(status & IPS_SEEN_REPLY))
			ctinfo = IP_CT_ESTABLISHED;
		else if (status & IPS_EXPECTED)
			ctinfo = IP_CT_RELATED;
		else
			ctinfo = IP_CT_NEW;
	}
	nf_ct_set(skb, ct, ctinfo);
	return 0;
}

/*
 * icmp packets need special treatment to handle error messages that are
 * related to a connection.
 *
 * Callers need to check if skb has a conntrack assigned when this
 * helper returns; in such case skb belongs to an already known connection.
*/ static unsigned int __cold nf_conntrack_handle_icmp(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, u8 protonum, const struct nf_hook_state *state) { int ret; if (state->pf == NFPROTO_IPV4 && protonum == IPPROTO_ICMP) ret = nf_conntrack_icmpv4_error(tmpl, skb, dataoff, state); #if IS_ENABLED(CONFIG_IPV6) else if (state->pf == NFPROTO_IPV6 && protonum == IPPROTO_ICMPV6) ret = nf_conntrack_icmpv6_error(tmpl, skb, dataoff, state); #endif else return NF_ACCEPT; if (ret <= 0) NF_CT_STAT_INC_ATOMIC(state->net, error); return ret; } static int generic_packet(struct nf_conn *ct, struct sk_buff *skb, enum ip_conntrack_info ctinfo) { const unsigned int *timeout = nf_ct_timeout_lookup(ct); if (!timeout) timeout = &nf_generic_pernet(nf_ct_net(ct))->timeout; nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); return NF_ACCEPT; } /* Returns verdict for packet, or -1 for invalid. */ static int nf_conntrack_handle_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state) { switch (nf_ct_protonum(ct)) { case IPPROTO_TCP: return nf_conntrack_tcp_packet(ct, skb, dataoff, ctinfo, state); case IPPROTO_UDP: return nf_conntrack_udp_packet(ct, skb, dataoff, ctinfo, state); case IPPROTO_ICMP: return nf_conntrack_icmp_packet(ct, skb, ctinfo, state); #if IS_ENABLED(CONFIG_IPV6) case IPPROTO_ICMPV6: return nf_conntrack_icmpv6_packet(ct, skb, ctinfo, state); #endif #ifdef CONFIG_NF_CT_PROTO_UDPLITE case IPPROTO_UDPLITE: return nf_conntrack_udplite_packet(ct, skb, dataoff, ctinfo, state); #endif #ifdef CONFIG_NF_CT_PROTO_SCTP case IPPROTO_SCTP: return nf_conntrack_sctp_packet(ct, skb, dataoff, ctinfo, state); #endif #ifdef CONFIG_NF_CT_PROTO_DCCP case IPPROTO_DCCP: return nf_conntrack_dccp_packet(ct, skb, dataoff, ctinfo, state); #endif #ifdef CONFIG_NF_CT_PROTO_GRE case IPPROTO_GRE: return nf_conntrack_gre_packet(ct, skb, dataoff, ctinfo, state); #endif } return generic_packet(ct, skb, ctinfo); } 
unsigned int nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state) { enum ip_conntrack_info ctinfo; struct nf_conn *ct, *tmpl; u_int8_t protonum; int dataoff, ret; tmpl = nf_ct_get(skb, &ctinfo); if (tmpl || ctinfo == IP_CT_UNTRACKED) { /* Previously seen (loopback or untracked)? Ignore. */ if ((tmpl && !nf_ct_is_template(tmpl)) || ctinfo == IP_CT_UNTRACKED) return NF_ACCEPT; skb->_nfct = 0; } /* rcu_read_lock()ed by nf_hook_thresh */ dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum); if (dataoff <= 0) { NF_CT_STAT_INC_ATOMIC(state->net, invalid); ret = NF_ACCEPT; goto out; } if (protonum == IPPROTO_ICMP || protonum == IPPROTO_ICMPV6) { ret = nf_conntrack_handle_icmp(tmpl, skb, dataoff, protonum, state); if (ret <= 0) { ret = -ret; goto out; } /* ICMP[v6] protocol trackers may assign one conntrack. */ if (skb->_nfct) goto out; } repeat: ret = resolve_normal_ct(tmpl, skb, dataoff, protonum, state); if (ret < 0) { /* Too stressed to deal. */ NF_CT_STAT_INC_ATOMIC(state->net, drop); ret = NF_DROP; goto out; } ct = nf_ct_get(skb, &ctinfo); if (!ct) { /* Not valid part of a connection */ NF_CT_STAT_INC_ATOMIC(state->net, invalid); ret = NF_ACCEPT; goto out; } ret = nf_conntrack_handle_packet(ct, skb, dataoff, ctinfo, state); if (ret <= 0) { /* Invalid: inverse of the return code tells * the netfilter core what to do */ nf_ct_put(ct); skb->_nfct = 0; /* Special case: TCP tracker reports an attempt to reopen a * closed/aborted connection. We have to go back and create a * fresh conntrack. 
*/ if (ret == -NF_REPEAT) goto repeat; NF_CT_STAT_INC_ATOMIC(state->net, invalid); if (ret == -NF_DROP) NF_CT_STAT_INC_ATOMIC(state->net, drop); ret = -ret; goto out; } if (ctinfo == IP_CT_ESTABLISHED_REPLY && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_REPLY, ct); out: if (tmpl) nf_ct_put(tmpl); return ret; } EXPORT_SYMBOL_GPL(nf_conntrack_in); /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */ void __nf_ct_refresh_acct(struct nf_conn *ct, enum ip_conntrack_info ctinfo, const struct sk_buff *skb, u32 extra_jiffies, bool do_acct) { /* Only update if this is not a fixed timeout */ if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) goto acct; /* If not in hash table, timer will not be active yet */ if (nf_ct_is_confirmed(ct)) extra_jiffies += nfct_time_stamp; if (READ_ONCE(ct->timeout) != extra_jiffies) WRITE_ONCE(ct->timeout, extra_jiffies); acct: if (do_acct) nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len); } EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct); bool nf_ct_kill_acct(struct nf_conn *ct, enum ip_conntrack_info ctinfo, const struct sk_buff *skb) { nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len); return nf_ct_delete(ct, 0, 0); } EXPORT_SYMBOL_GPL(nf_ct_kill_acct); #if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> #include <linux/mutex.h> /* Generic function for tcp/udp/sctp/dccp and alike. 
*/ int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple) { if (nla_put_be16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port) || nla_put_be16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port)) goto nla_put_failure; return 0; nla_put_failure: return -1; } EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr); const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = { [CTA_PROTO_SRC_PORT] = { .type = NLA_U16 }, [CTA_PROTO_DST_PORT] = { .type = NLA_U16 }, }; EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy); int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[], struct nf_conntrack_tuple *t, u_int32_t flags) { if (flags & CTA_FILTER_FLAG(CTA_PROTO_SRC_PORT)) { if (!tb[CTA_PROTO_SRC_PORT]) return -EINVAL; t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]); } if (flags & CTA_FILTER_FLAG(CTA_PROTO_DST_PORT)) { if (!tb[CTA_PROTO_DST_PORT]) return -EINVAL; t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]); } return 0; } EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple); unsigned int nf_ct_port_nlattr_tuple_size(void) { static unsigned int size __read_mostly; if (!size) size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1); return size; } EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size); #endif /* Used by ipt_REJECT and ip6t_REJECT. 
*/
/* Attach the conntrack of @skb to @nskb with the "related" ctinfo of the
 * opposite direction, and take a reference for the new skb.
 */
static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
{
	struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;

	/* This ICMP is in reverse direction to the packet which caused it */
	ct = nf_ct_get(skb, &ctinfo);
	if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
		ctinfo = IP_CT_RELATED_REPLY;
	else
		ctinfo = IP_CT_RELATED;

	/* Attach to new skbuff, and increment count */
	nf_ct_set(nskb, ct, ctinfo);
	nf_conntrack_get(skb_nfct(nskb));
}

/* Re-resolve the conntrack of a packet that carries an unconfirmed entry.
 *
 * The tuple is rebuilt from the packet, with the NATed parts replaced by the
 * original tuple of @ct.  If a hashed entry matches, the reference on @ct is
 * put, the found entry is attached to @skb and the NAT mangling already
 * applied (per the old status bits) is redone via the NAT hook.
 * Returns NF_DROP if the tuple cannot be extracted, otherwise NF_ACCEPT or
 * the non-accept verdict of ->manip_pkt().
 */
static int __nf_conntrack_update(struct net *net, struct sk_buff *skb,
				 struct nf_conn *ct,
				 enum ip_conntrack_info ctinfo)
{
	const struct nf_nat_hook *nat_hook;
	struct nf_conntrack_tuple_hash *h;
	struct nf_conntrack_tuple tuple;
	unsigned int status;
	int dataoff;
	u16 l3num;
	u8 l4num;

	l3num = nf_ct_l3num(ct);

	dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num);
	if (dataoff <= 0)
		return NF_DROP;

	if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
			     l4num, net, &tuple))
		return NF_DROP;

	/* Undo source NAT in the lookup key. */
	if (ct->status & IPS_SRC_NAT) {
		memcpy(tuple.src.u3.all,
		       ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.all,
		       sizeof(tuple.src.u3.all));
		tuple.src.u.all =
			ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all;
	}

	/* Undo destination NAT in the lookup key. */
	if (ct->status & IPS_DST_NAT) {
		memcpy(tuple.dst.u3.all,
		       ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.all,
		       sizeof(tuple.dst.u3.all));
		tuple.dst.u.all =
			ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.all;
	}

	h = nf_conntrack_find_get(net, nf_ct_zone(ct), &tuple);
	if (!h)
		return NF_ACCEPT;

	/* Store status bits of the conntrack that is clashing to re-do NAT
	 * mangling according to what it has been done already to this packet.
	 */
	status = ct->status;

	/* Swap the skb over to the entry found in the table. */
	nf_ct_put(ct);
	ct = nf_ct_tuplehash_to_ctrack(h);
	nf_ct_set(skb, ct, ctinfo);

	nat_hook = rcu_dereference(nf_nat_hook);
	if (!nat_hook)
		return NF_ACCEPT;

	if (status & IPS_SRC_NAT) {
		unsigned int verdict = nat_hook->manip_pkt(skb, ct,
							   NF_NAT_MANIP_SRC,
							   IP_CT_DIR_ORIGINAL);
		if (verdict != NF_ACCEPT)
			return verdict;
	}

	if (status & IPS_DST_NAT) {
		unsigned int verdict = nat_hook->manip_pkt(skb, ct,
							   NF_NAT_MANIP_DST,
							   IP_CT_DIR_ORIGINAL);
		if (verdict != NF_ACCEPT)
			return verdict;
	}

	return NF_ACCEPT;
}

/* This packet is coming from userspace via nf_queue, complete the packet
 * processing after the helper invocation in nf_confirm().
 *
 * Packets whose conntrack has no helper, or a helper without
 * NF_CT_HELPER_F_USERSPACE, are accepted without further work.
 */
static int nf_confirm_cthelper(struct sk_buff *skb, struct nf_conn *ct,
			       enum ip_conntrack_info ctinfo)
{
	const struct nf_conntrack_helper *helper;
	const struct nf_conn_help *help;
	int protoff;

	help = nfct_help(ct);
	if (!help)
		return NF_ACCEPT;

	helper = rcu_dereference(help->helper);
	if (!helper)
		return NF_ACCEPT;

	if (!(helper->flags & NF_CT_HELPER_F_USERSPACE))
		return NF_ACCEPT;

	/* Locate the transport header for the sequence adjustment below. */
	switch (nf_ct_l3num(ct)) {
	case NFPROTO_IPV4:
		protoff = skb_network_offset(skb) + ip_hdrlen(skb);
		break;
#if IS_ENABLED(CONFIG_IPV6)
	case NFPROTO_IPV6: {
		__be16 frag_off;
		u8 pnum;

		pnum = ipv6_hdr(skb)->nexthdr;
		protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum,
					   &frag_off);
		/* Accept as-is when the header walk fails or the fragment
		 * offset bits are non-zero.
		 */
		if (protoff < 0 || (frag_off & htons(~0x7)) != 0)
			return NF_ACCEPT;
		break;
	}
#endif
	default:
		return NF_ACCEPT;
	}

	if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
	    !nf_is_loopback_packet(skb)) {
		if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) {
			NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
			return NF_DROP;
		}
	}

	/* We've seen it coming out the other side: confirm it */
	return nf_conntrack_confirm(skb);
}

/* ->update hook: for an unconfirmed conntrack run __nf_conntrack_update()
 * first and re-read the conntrack from the skb, then finish helper
 * processing via nf_confirm_cthelper().
 */
static int nf_conntrack_update(struct net *net, struct sk_buff *skb)
{
	enum ip_conntrack_info ctinfo;
	struct nf_conn *ct;

	ct = nf_ct_get(skb, &ctinfo);
	if (!ct)
		return NF_ACCEPT;

	if (!nf_ct_is_confirmed(ct)) {
		int ret = __nf_conntrack_update(net, skb, ct, ctinfo);

		if (ret != NF_ACCEPT)
			return ret;

		/* The skb may now carry a different conntrack. */
		ct = nf_ct_get(skb, &ctinfo);
		if (!ct)
			return NF_ACCEPT;
	}

	return nf_confirm_cthelper(skb, ct, ctinfo);
}

/* ->get_tuple_skb hook: copy a conntrack tuple for @skb into @dst_tuple.
 *
 * With a conntrack attached, the tuple of the packet's direction is copied.
 * Otherwise the packet is parsed as NFPROTO_IPV4, looked up in the default
 * zone, and the tuple of the direction opposite to the matched hash entry is
 * copied.  Returns false if parsing or lookup fails.
 */
static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
				       const struct sk_buff *skb)
{
	const struct nf_conntrack_tuple *src_tuple;
	const struct nf_conntrack_tuple_hash *hash;
	struct nf_conntrack_tuple srctuple;
	enum ip_conntrack_info ctinfo;
	struct nf_conn *ct;

	ct = nf_ct_get(skb, &ctinfo);
	if (ct) {
		src_tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo));
		memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple));
		return true;
	}

	if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
			       NFPROTO_IPV4, dev_net(skb->dev),
			       &srctuple))
		return false;

	hash = nf_conntrack_find_get(dev_net(skb->dev),
				     &nf_ct_zone_dflt,
				     &srctuple);
	if (!hash)
		return false;

	ct = nf_ct_tuplehash_to_ctrack(hash);
	src_tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir);
	memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple));
	/* Drop the reference taken by the lookup. */
	nf_ct_put(ct);

	return true;
}

/* Bring out ya dead! */
/* Scan the table from *bucket onwards for the next conntrack accepted by
 * @iter (restricted to iter_data->net when that is set).  Returns it with an
 * extra reference, or NULL when the end of the table is reached.  *bucket is
 * left at the bucket of the match so the caller can resume there.
 */
static struct nf_conn *
get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
		const struct nf_ct_iter_data *iter_data, unsigned int *bucket)
{
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;
	struct hlist_nulls_node *n;
	spinlock_t *lockp;

	for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
		struct hlist_nulls_head *hslot = &nf_conntrack_hash[*bucket];

		/* Empty buckets are skipped without taking the lock. */
		if (hlist_nulls_empty(hslot))
			continue;

		lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
		local_bh_disable();
		nf_conntrack_lock(lockp);
		hlist_nulls_for_each_entry(h, n, hslot, hnnode) {
			if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY)
				continue;
			/* All nf_conn objects are added to hash table twice, one
			 * for original direction tuple, once for the reply tuple.
			 *
			 * Exception: In the IPS_NAT_CLASH case, only the reply
			 * tuple is added (the original tuple already existed for
			 * a different object).
			 *
			 * We only need to call the iterator once for each
			 * conntrack, so we just use the 'reply' direction
			 * tuple while iterating.
			 */
			ct = nf_ct_tuplehash_to_ctrack(h);

			if (iter_data->net &&
			    !net_eq(iter_data->net, nf_ct_net(ct)))
				continue;

			if (iter(ct, iter_data->data))
				goto found;
		}
		spin_unlock(lockp);
		local_bh_enable();
		cond_resched();
	}

	return NULL;
found:
	/* The reference is taken while the bucket lock is still held. */
	refcount_inc(&ct->ct_general.use);
	spin_unlock(lockp);
	local_bh_enable();
	return ct;
}

/* Delete every conntrack selected by @iter.  The whole walk runs under
 * nf_conntrack_mutex.
 */
static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data),
				  const struct nf_ct_iter_data *iter_data)
{
	unsigned int bucket = 0;
	struct nf_conn *ct;

	might_sleep();

	mutex_lock(&nf_conntrack_mutex);
	while ((ct = get_next_corpse(iter, iter_data, &bucket)) != NULL) {
		/* Time to push up daisies... */

		nf_ct_delete(ct, iter_data->portid, iter_data->report);
		nf_ct_put(ct);
		cond_resched();
	}
	mutex_unlock(&nf_conntrack_mutex);
}

/* Like nf_ct_iterate_cleanup(), but returns immediately when the netns in
 * iter_data->net currently has no conntracks.
 */
void nf_ct_iterate_cleanup_net(int (*iter)(struct nf_conn *i, void *data),
			       const struct nf_ct_iter_data *iter_data)
{
	struct net *net = iter_data->net;
	struct nf_conntrack_net *cnet = nf_ct_pernet(net);

	might_sleep();

	if (atomic_read(&cnet->count) == 0)
		return;

	nf_ct_iterate_cleanup(iter, iter_data);
}
EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net);

/**
 * nf_ct_iterate_destroy - destroy unconfirmed conntracks and iterate table
 * @iter: callback to invoke for each conntrack
 * @data: data to pass to @iter
 *
 * Like nf_ct_iterate_cleanup, but first marks conntracks on the
 * unconfirmed list as dying (so they will not be inserted into
 * main table).
 *
 * Can only be called in module exit path.
 */
void
nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data)
{
	struct nf_ct_iter_data iter_data = {};
	struct net *net;

	/* Drop queued packets in every netns that still has conntracks. */
	down_read(&net_rwsem);
	for_each_net(net) {
		struct nf_conntrack_net *cnet = nf_ct_pernet(net);

		if (atomic_read(&cnet->count) == 0)
			continue;
		nf_queue_nf_hook_drop(net);
	}
	up_read(&net_rwsem);

	/* Need to wait for netns cleanup worker to finish, if it's
	 * running -- it might have deleted a net namespace from
	 * the global list, so hook drop above might not have
	 * affected all namespaces.
	 */
	net_ns_barrier();

	/* a skb w. unconfirmed conntrack could have been reinjected just
	 * before we called nf_queue_nf_hook_drop().
	 *
	 * This makes sure it's inserted into conntrack table.
	 */
	synchronize_net();

	nf_ct_ext_bump_genid();
	/* iter_data.net stays NULL, so the walk covers all namespaces. */
	iter_data.data = data;
	nf_ct_iterate_cleanup(iter, &iter_data);

	/* Another cpu might be in a rcu read section with
	 * rcu protected pointer cleared in iter callback
	 * or hidden via nf_ct_ext_bump_genid() above.
	 *
	 * Wait until those are done.
	 */
	synchronize_rcu();
}
EXPORT_SYMBOL_GPL(nf_ct_iterate_destroy);

/* Iterator callback that selects every conntrack. */
static int kill_all(struct nf_conn *i, void *data)
{
	return 1;
}

/* First teardown stage: unregister the BPF hooks and tell gc_worker() not to
 * re-queue itself.
 */
void nf_conntrack_cleanup_start(void)
{
	cleanup_nf_conntrack_bpf();
	conntrack_gc_work.exiting = true;
}

/* Final teardown stage: unpublish the hook, stop the gc work, then free the
 * hash table, the sub-systems and the slab cache.
 */
void nf_conntrack_cleanup_end(void)
{
	RCU_INIT_POINTER(nf_ct_hook, NULL);
	cancel_delayed_work_sync(&conntrack_gc_work.dwork);
	kvfree(nf_conntrack_hash);

	nf_conntrack_proto_fini();
	nf_conntrack_helper_fini();
	nf_conntrack_expect_fini();

	kmem_cache_destroy(nf_conntrack_cachep);
}

/*
 * Mishearing the voices in his head, our hero wonders how he's
 * supposed to kill the mall.
 */
/* Single-namespace wrapper around nf_conntrack_cleanup_net_list(). */
void nf_conntrack_cleanup_net(struct net *net)
{
	LIST_HEAD(single);

	list_add(&net->exit_list, &single);
	nf_conntrack_cleanup_net_list(&single);
}

/* Kill all conntracks of every netns on @net_exit_list, repeating the sweep
 * until each per-netns counter reads zero, then release the per-netns
 * ecache/expect state and the per-cpu statistics.
 */
void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list)
{
	struct nf_ct_iter_data iter_data = {};
	struct net *net;
	int busy;

	/*
	 * This makes sure all current packets have passed through
	 *  netfilter framework.  Roll on, two-stage module
	 *  delete...
	 */
	synchronize_rcu_expedited();
i_see_dead_people:
	busy = 0;
	list_for_each_entry(net, net_exit_list, exit_list) {
		struct nf_conntrack_net *cnet = nf_ct_pernet(net);

		iter_data.net = net;
		nf_ct_iterate_cleanup_net(kill_all, &iter_data);
		if (atomic_read(&cnet->count) != 0)
			busy = 1;
	}
	/* Some entries are still accounted: yield and sweep again. */
	if (busy) {
		schedule();
		goto i_see_dead_people;
	}

	list_for_each_entry(net, net_exit_list, exit_list) {
		nf_conntrack_ecache_pernet_fini(net);
		nf_conntrack_expect_pernet_fini(net);
		free_percpu(net->ct.stat);
	}
}

/* Allocate a zeroed hash table.  *@sizep is rounded up to a multiple of the
 * number of heads that fit in one page and updated in place.  When @nulls is
 * non-zero each head is initialised as a nulls list carrying its own index.
 * Returns NULL if the requested size is too large or allocation fails.
 */
void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
{
	struct hlist_nulls_head *hash;
	unsigned int nr_slots, i;

	if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head)))
		return NULL;

	BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
	nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));

	hash = kvcalloc(nr_slots, sizeof(struct hlist_nulls_head), GFP_KERNEL);

	if (hash && nulls)
		for (i = 0; i < nr_slots; i++)
			INIT_HLIST_NULLS_HEAD(&hash[i], i);

	return hash;
}
EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable);

/* Replace the global hash table by one of (rounded-up) @hashsize buckets and
 * rehash all entries into it.  Returns 0 on success or when the size is
 * unchanged, -EINVAL for a zero size, -ENOMEM if allocation fails.
 */
int nf_conntrack_hash_resize(unsigned int hashsize)
{
	int i, bucket;
	unsigned int old_size;
	struct hlist_nulls_head *hash, *old_hash;
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;

	if (!hashsize)
		return -EINVAL;

	/* The new table is allocated before any lock is taken. */
	hash = nf_ct_alloc_hashtable(&hashsize, 1);
	if (!hash)
		return -ENOMEM;

	mutex_lock(&nf_conntrack_mutex);
	old_size = nf_conntrack_htable_size;
	if (old_size == hashsize) {
		mutex_unlock(&nf_conntrack_mutex);
		kvfree(hash);
		return 0;
	}

	local_bh_disable();
	nf_conntrack_all_lock();
	write_seqcount_begin(&nf_conntrack_generation);

	/* Lookups in the old hash might happen in parallel, which means we
	 * might get false negatives during connection lookup. New connections
	 * created because of a false negative won't make it into the hash
	 * though since that required taking the locks.
	 */

	/* Move every entry from the old bucket head to its new bucket. */
	for (i = 0; i < nf_conntrack_htable_size; i++) {
		while (!hlist_nulls_empty(&nf_conntrack_hash[i])) {
			unsigned int zone_id;

			h = hlist_nulls_entry(nf_conntrack_hash[i].first,
					      struct nf_conntrack_tuple_hash, hnnode);
			ct = nf_ct_tuplehash_to_ctrack(h);
			hlist_nulls_del_rcu(&h->hnnode);

			zone_id = nf_ct_zone_id(nf_ct_zone(ct), NF_CT_DIRECTION(h));
			bucket = __hash_conntrack(nf_ct_net(ct),
						  &h->tuple, zone_id, hashsize);
			hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
		}
	}
	old_hash = nf_conntrack_hash;

	nf_conntrack_hash = hash;
	nf_conntrack_htable_size = hashsize;

	write_seqcount_end(&nf_conntrack_generation);
	nf_conntrack_all_unlock();
	local_bh_enable();

	mutex_unlock(&nf_conntrack_mutex);

	/* The old table is only freed after synchronize_net() returns. */
	synchronize_net();
	kvfree(old_hash);
	return 0;
}

/* Module parameter setter for the hash size.  Only callable from the initial
 * network namespace.  Before the table exists the value is merely stored;
 * afterwards it triggers a resize.
 */
int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp)
{
	unsigned int hashsize;
	int rc;

	if (current->nsproxy->net_ns != &init_net)
		return -EOPNOTSUPP;

	/* On boot, we can set this without any fancy locking. */
	if (!nf_conntrack_hash)
		return param_set_uint(val, kp);

	rc = kstrtouint(val, 0, &hashsize);
	if (rc)
		return rc;

	return nf_conntrack_hash_resize(hashsize);
}

/* Global initialisation: locks, hash table (sized from available memory when
 * no size was configured), nf_conntrack_max, slab cache, expect/helper/proto
 * sub-systems, the gc work and the BPF hooks.  On failure everything set up
 * so far is undone in reverse order and a negative errno is returned.
 */
int nf_conntrack_init_start(void)
{
	unsigned long nr_pages = totalram_pages();
	int max_factor = 8;
	int ret = -ENOMEM;
	int i;

	seqcount_spinlock_init(&nf_conntrack_generation,
			       &nf_conntrack_locks_all_lock);

	for (i = 0; i < CONNTRACK_LOCKS; i++)
		spin_lock_init(&nf_conntrack_locks[i]);

	if (!nf_conntrack_htable_size) {
		/* Default: scale with memory, then override with a fixed
		 * size above the 4GiB (64-bit only) and 1GiB thresholds;
		 * never below 1024 buckets.
		 */
		nf_conntrack_htable_size
			= (((nr_pages << PAGE_SHIFT) / 16384)
			   / sizeof(struct hlist_head));
		if (BITS_PER_LONG >= 64 &&
		    nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE)))
			nf_conntrack_htable_size = 262144;
		else if (nr_pages > (1024 * 1024 * 1024 / PAGE_SIZE))
			nf_conntrack_htable_size = 65536;

		if (nf_conntrack_htable_size < 1024)
			nf_conntrack_htable_size = 1024;
		/* Use a max. factor of one by default to keep the average
		 * hash chain length at 2 entries.  Each entry has to be added
		 * twice (once for original direction, once for reply).
		 * When a table size is given we use the old value of 8 to
		 * avoid implicit reduction of the max entries setting.
		 */
		max_factor = 1;
	}

	nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1);
	if (!nf_conntrack_hash)
		return -ENOMEM;

	nf_conntrack_max = max_factor * nf_conntrack_htable_size;

	nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
						sizeof(struct nf_conn),
						NFCT_INFOMASK + 1,
						SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN, NULL);
	/* 'ret' still holds -ENOMEM from its initialiser here. */
	if (!nf_conntrack_cachep)
		goto err_cachep;

	ret = nf_conntrack_expect_init();
	if (ret < 0)
		goto err_expect;

	ret = nf_conntrack_helper_init();
	if (ret < 0)
		goto err_helper;

	ret = nf_conntrack_proto_init();
	if (ret < 0)
		goto err_proto;

	/* First gc run is scheduled one second (HZ jiffies) from now. */
	conntrack_gc_work_init(&conntrack_gc_work);
	queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ);

	ret = register_nf_conntrack_bpf();
	if (ret < 0)
		goto err_kfunc;

	return 0;

err_kfunc:
	cancel_delayed_work_sync(&conntrack_gc_work.dwork);
	nf_conntrack_proto_fini();
err_proto:
	nf_conntrack_helper_fini();
err_helper:
	nf_conntrack_expect_fini();
err_expect:
	kmem_cache_destroy(nf_conntrack_cachep);
err_cachep:
	kvfree(nf_conntrack_hash);
	return ret;
}

/* ->set_closing hook: only TCP conntracks are acted upon. */
static void nf_conntrack_set_closing(struct nf_conntrack *nfct)
{
	struct nf_conn *ct = nf_ct_to_nf_conn(nfct);

	switch (nf_ct_protonum(ct)) {
	case IPPROTO_TCP:
		nf_conntrack_tcp_set_closing(ct);
		break;
	}
}

/* Callbacks published through nf_ct_hook by nf_conntrack_init_end(). */
static const struct nf_ct_hook nf_conntrack_hook = {
	.update		= nf_conntrack_update,
	.destroy	= nf_ct_destroy,
	.get_tuple_skb  = nf_conntrack_get_tuple_skb,
	.attach		= nf_conntrack_attach,
	.set_closing	= nf_conntrack_set_closing,
	.confirm	= __nf_conntrack_confirm,
};

/* Last init step: make the conntrack hook visible. */
void nf_conntrack_init_end(void)
{
	RCU_INIT_POINTER(nf_ct_hook, &nf_conntrack_hook);
}

/*
 * We need to use special "null" values, not used in hash table
 */
#define UNCONFIRMED_NULLS_VAL	((1<<30)+0)

int nf_conntrack_init_net(struct net *net)
{
	struct nf_conntrack_net *cnet = nf_ct_pernet(net);
	int ret = -ENOMEM;

	BUILD_BUG_ON(IP_CT_UNTRACKED == IP_CT_NUMBER);
BUILD_BUG_ON_NOT_POWER_OF_2(CONNTRACK_LOCKS); atomic_set(&cnet->count, 0); net->ct.stat = alloc_percpu(struct ip_conntrack_stat); if (!net->ct.stat) return ret; ret = nf_conntrack_expect_pernet_init(net); if (ret < 0) goto err_expect; nf_conntrack_acct_pernet_init(net); nf_conntrack_tstamp_pernet_init(net); nf_conntrack_ecache_pernet_init(net); nf_conntrack_proto_pernet_init(net); return 0; err_expect: free_percpu(net->ct.stat); return ret; } /* ctnetlink code shared by both ctnetlink and nf_conntrack_bpf */ int __nf_ct_change_timeout(struct nf_conn *ct, u64 timeout) { if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) return -EPERM; __nf_ct_set_timeout(ct, timeout); if (test_bit(IPS_DYING_BIT, &ct->status)) return -ETIME; return 0; } EXPORT_SYMBOL_GPL(__nf_ct_change_timeout); void __nf_ct_change_status(struct nf_conn *ct, unsigned long on, unsigned long off) { unsigned int bit; /* Ignore these unchangable bits */ on &= ~IPS_UNCHANGEABLE_MASK; off &= ~IPS_UNCHANGEABLE_MASK; for (bit = 0; bit < __IPS_MAX_BIT; bit++) { if (on & (1 << bit)) set_bit(bit, &ct->status); else if (off & (1 << bit)) clear_bit(bit, &ct->status); } } EXPORT_SYMBOL_GPL(__nf_ct_change_status); int nf_ct_change_status_common(struct nf_conn *ct, unsigned int status) { unsigned long d; d = ct->status ^ status; if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING)) /* unchangeable */ return -EBUSY; if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY)) /* SEEN_REPLY bit can only be set */ return -EBUSY; if (d & IPS_ASSURED && !(status & IPS_ASSURED)) /* ASSURED bit can only be set */ return -EBUSY; __nf_ct_change_status(ct, status, 0); return 0; } EXPORT_SYMBOL_GPL(nf_ct_change_status_common);
743 743 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/pm_qos.h> static inline void device_pm_init_common(struct device *dev) { if (!dev->power.early_init) { spin_lock_init(&dev->power.lock); dev->power.qos = NULL; dev->power.early_init = true; } } #ifdef CONFIG_PM static inline void pm_runtime_early_init(struct device *dev) { dev->power.disable_depth = 1; device_pm_init_common(dev); } extern void pm_runtime_init(struct device *dev); extern void pm_runtime_reinit(struct device *dev); extern void pm_runtime_remove(struct device *dev); extern u64 pm_runtime_active_time(struct device *dev); #define WAKE_IRQ_DEDICATED_ALLOCATED BIT(0) #define WAKE_IRQ_DEDICATED_MANAGED BIT(1) #define WAKE_IRQ_DEDICATED_REVERSE BIT(2) #define WAKE_IRQ_DEDICATED_MASK (WAKE_IRQ_DEDICATED_ALLOCATED | \ WAKE_IRQ_DEDICATED_MANAGED | \ WAKE_IRQ_DEDICATED_REVERSE) #define WAKE_IRQ_DEDICATED_ENABLED BIT(3) struct wake_irq { struct device *dev; unsigned int status; int irq; const char *name; }; extern void dev_pm_arm_wake_irq(struct wake_irq *wirq); extern void dev_pm_disarm_wake_irq(struct wake_irq *wirq); extern void dev_pm_enable_wake_irq_check(struct device *dev, bool can_change_status); extern void dev_pm_disable_wake_irq_check(struct device *dev, bool cond_disable); extern void dev_pm_enable_wake_irq_complete(struct device *dev); #ifdef CONFIG_PM_SLEEP extern void device_wakeup_attach_irq(struct device *dev, struct 
wake_irq *wakeirq); extern void device_wakeup_detach_irq(struct device *dev); extern void device_wakeup_arm_wake_irqs(void); extern void device_wakeup_disarm_wake_irqs(void); #else static inline void device_wakeup_attach_irq(struct device *dev, struct wake_irq *wakeirq) {} static inline void device_wakeup_detach_irq(struct device *dev) { } #endif /* CONFIG_PM_SLEEP */ /* * sysfs.c */ extern int dpm_sysfs_add(struct device *dev); extern void dpm_sysfs_remove(struct device *dev); extern void rpm_sysfs_remove(struct device *dev); extern int wakeup_sysfs_add(struct device *dev); extern void wakeup_sysfs_remove(struct device *dev); extern int pm_qos_sysfs_add_resume_latency(struct device *dev); extern void pm_qos_sysfs_remove_resume_latency(struct device *dev); extern int pm_qos_sysfs_add_flags(struct device *dev); extern void pm_qos_sysfs_remove_flags(struct device *dev); extern int pm_qos_sysfs_add_latency_tolerance(struct device *dev); extern void pm_qos_sysfs_remove_latency_tolerance(struct device *dev); extern int dpm_sysfs_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid); #else /* CONFIG_PM */ static inline void pm_runtime_early_init(struct device *dev) { device_pm_init_common(dev); } static inline void pm_runtime_init(struct device *dev) {} static inline void pm_runtime_reinit(struct device *dev) {} static inline void pm_runtime_remove(struct device *dev) {} static inline int dpm_sysfs_add(struct device *dev) { return 0; } static inline void dpm_sysfs_remove(struct device *dev) {} static inline int dpm_sysfs_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid) { return 0; } #endif #ifdef CONFIG_PM_SLEEP /* kernel/power/main.c */ extern int pm_async_enabled; /* drivers/base/power/main.c */ extern struct list_head dpm_list; /* The active device list */ static inline struct device *to_device(struct list_head *entry) { return container_of(entry, struct device, power.entry); } extern void device_pm_sleep_init(struct device *dev); extern void 
device_pm_add(struct device *); extern void device_pm_remove(struct device *); extern void device_pm_move_before(struct device *, struct device *); extern void device_pm_move_after(struct device *, struct device *); extern void device_pm_move_last(struct device *); extern void device_pm_check_callbacks(struct device *dev); static inline bool device_pm_initialized(struct device *dev) { return dev->power.in_dpm_list; } /* drivers/base/power/wakeup_stats.c */ extern int wakeup_source_sysfs_add(struct device *parent, struct wakeup_source *ws); extern void wakeup_source_sysfs_remove(struct wakeup_source *ws); extern int pm_wakeup_source_sysfs_add(struct device *parent); #else /* !CONFIG_PM_SLEEP */ static inline void device_pm_sleep_init(struct device *dev) {} static inline void device_pm_add(struct device *dev) {} static inline void device_pm_remove(struct device *dev) { pm_runtime_remove(dev); } static inline void device_pm_move_before(struct device *deva, struct device *devb) {} static inline void device_pm_move_after(struct device *deva, struct device *devb) {} static inline void device_pm_move_last(struct device *dev) {} static inline void device_pm_check_callbacks(struct device *dev) {} static inline bool device_pm_initialized(struct device *dev) { return device_is_registered(dev); } static inline int pm_wakeup_source_sysfs_add(struct device *parent) { return 0; } #endif /* !CONFIG_PM_SLEEP */ static inline void device_pm_init(struct device *dev) { device_pm_init_common(dev); device_pm_sleep_init(dev); pm_runtime_init(dev); }
49 40 43 50 6 3 5 50 6 39 1 5 23 3 5 4 2 6 7 1 32 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 // SPDX-License-Identifier: GPL-2.0-only #include "netlink.h" #include "common.h" #include "bitset.h" struct features_req_info { struct ethnl_req_info base; }; struct features_reply_data { struct ethnl_reply_data base; u32 hw[ETHTOOL_DEV_FEATURE_WORDS]; u32 wanted[ETHTOOL_DEV_FEATURE_WORDS]; u32 active[ETHTOOL_DEV_FEATURE_WORDS]; u32 nochange[ETHTOOL_DEV_FEATURE_WORDS]; u32 all[ETHTOOL_DEV_FEATURE_WORDS]; }; #define FEATURES_REPDATA(__reply_base) \ container_of(__reply_base, struct features_reply_data, base) const struct nla_policy ethnl_features_get_policy[] = { [ETHTOOL_A_FEATURES_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static void ethnl_features_to_bitmap32(u32 *dest, netdev_features_t src) { unsigned int i; for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; i++) dest[i] = src >> (32 * i); } static int features_prepare_data(const struct ethnl_req_info *req_base, 
struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct features_reply_data *data = FEATURES_REPDATA(reply_base); struct net_device *dev = reply_base->dev; netdev_features_t all_features; ethnl_features_to_bitmap32(data->hw, dev->hw_features); ethnl_features_to_bitmap32(data->wanted, dev->wanted_features); ethnl_features_to_bitmap32(data->active, dev->features); ethnl_features_to_bitmap32(data->nochange, NETIF_F_NEVER_CHANGE); all_features = GENMASK_ULL(NETDEV_FEATURE_COUNT - 1, 0); ethnl_features_to_bitmap32(data->all, all_features); return 0; } static int features_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct features_reply_data *data = FEATURES_REPDATA(reply_base); bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; unsigned int len = 0; int ret; ret = ethnl_bitset32_size(data->hw, data->all, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); if (ret < 0) return ret; len += ret; ret = ethnl_bitset32_size(data->wanted, NULL, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); if (ret < 0) return ret; len += ret; ret = ethnl_bitset32_size(data->active, NULL, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); if (ret < 0) return ret; len += ret; ret = ethnl_bitset32_size(data->nochange, NULL, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); if (ret < 0) return ret; len += ret; return len; } static int features_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct features_reply_data *data = FEATURES_REPDATA(reply_base); bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; int ret; ret = ethnl_put_bitset32(skb, ETHTOOL_A_FEATURES_HW, data->hw, data->all, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); if (ret < 0) return ret; ret = ethnl_put_bitset32(skb, ETHTOOL_A_FEATURES_WANTED, data->wanted, NULL, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); 
if (ret < 0) return ret; ret = ethnl_put_bitset32(skb, ETHTOOL_A_FEATURES_ACTIVE, data->active, NULL, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); if (ret < 0) return ret; return ethnl_put_bitset32(skb, ETHTOOL_A_FEATURES_NOCHANGE, data->nochange, NULL, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); } const struct ethnl_request_ops ethnl_features_request_ops = { .request_cmd = ETHTOOL_MSG_FEATURES_GET, .reply_cmd = ETHTOOL_MSG_FEATURES_GET_REPLY, .hdr_attr = ETHTOOL_A_FEATURES_HEADER, .req_info_size = sizeof(struct features_req_info), .reply_data_size = sizeof(struct features_reply_data), .prepare_data = features_prepare_data, .reply_size = features_reply_size, .fill_reply = features_fill_reply, }; /* FEATURES_SET */ const struct nla_policy ethnl_features_set_policy[] = { [ETHTOOL_A_FEATURES_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_FEATURES_WANTED] = { .type = NLA_NESTED }, }; static void ethnl_features_to_bitmap(unsigned long *dest, netdev_features_t val) { const unsigned int words = BITS_TO_LONGS(NETDEV_FEATURE_COUNT); unsigned int i; for (i = 0; i < words; i++) dest[i] = (unsigned long)(val >> (i * BITS_PER_LONG)); } static netdev_features_t ethnl_bitmap_to_features(unsigned long *src) { const unsigned int nft_bits = sizeof(netdev_features_t) * BITS_PER_BYTE; const unsigned int words = BITS_TO_LONGS(NETDEV_FEATURE_COUNT); netdev_features_t ret = 0; unsigned int i; for (i = 0; i < words; i++) ret |= (netdev_features_t)(src[i]) << (i * BITS_PER_LONG); ret &= ~(netdev_features_t)0 >> (nft_bits - NETDEV_FEATURE_COUNT); return ret; } static int features_send_reply(struct net_device *dev, struct genl_info *info, const unsigned long *wanted, const unsigned long *wanted_mask, const unsigned long *active, const unsigned long *active_mask, bool compact) { struct sk_buff *rskb; void *reply_payload; int reply_len = 0; int ret; reply_len = ethnl_reply_header_size(); ret = ethnl_bitset_size(wanted, wanted_mask, NETDEV_FEATURE_COUNT, 
netdev_features_strings, compact); if (ret < 0) goto err; reply_len += ret; ret = ethnl_bitset_size(active, active_mask, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); if (ret < 0) goto err; reply_len += ret; ret = -ENOMEM; rskb = ethnl_reply_init(reply_len, dev, ETHTOOL_MSG_FEATURES_SET_REPLY, ETHTOOL_A_FEATURES_HEADER, info, &reply_payload); if (!rskb) goto err; ret = ethnl_put_bitset(rskb, ETHTOOL_A_FEATURES_WANTED, wanted, wanted_mask, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); if (ret < 0) goto nla_put_failure; ret = ethnl_put_bitset(rskb, ETHTOOL_A_FEATURES_ACTIVE, active, active_mask, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); if (ret < 0) goto nla_put_failure; genlmsg_end(rskb, reply_payload); ret = genlmsg_reply(rskb, info); return ret; nla_put_failure: nlmsg_free(rskb); WARN_ONCE(1, "calculated message payload length (%d) not sufficient\n", reply_len); err: GENL_SET_ERR_MSG(info, "failed to send reply message"); return ret; } int ethnl_set_features(struct sk_buff *skb, struct genl_info *info) { DECLARE_BITMAP(wanted_diff_mask, NETDEV_FEATURE_COUNT); DECLARE_BITMAP(active_diff_mask, NETDEV_FEATURE_COUNT); DECLARE_BITMAP(old_active, NETDEV_FEATURE_COUNT); DECLARE_BITMAP(old_wanted, NETDEV_FEATURE_COUNT); DECLARE_BITMAP(new_active, NETDEV_FEATURE_COUNT); DECLARE_BITMAP(new_wanted, NETDEV_FEATURE_COUNT); DECLARE_BITMAP(req_wanted, NETDEV_FEATURE_COUNT); DECLARE_BITMAP(req_mask, NETDEV_FEATURE_COUNT); struct ethnl_req_info req_info = {}; struct nlattr **tb = info->attrs; struct net_device *dev; bool mod; int ret; if (!tb[ETHTOOL_A_FEATURES_WANTED]) return -EINVAL; ret = ethnl_parse_header_dev_get(&req_info, tb[ETHTOOL_A_FEATURES_HEADER], genl_info_net(info), info->extack, true); if (ret < 0) return ret; dev = req_info.dev; rtnl_lock(); ret = ethnl_ops_begin(dev); if (ret < 0) goto out_rtnl; ethnl_features_to_bitmap(old_active, dev->features); ethnl_features_to_bitmap(old_wanted, dev->wanted_features); ret = 
ethnl_parse_bitset(req_wanted, req_mask, NETDEV_FEATURE_COUNT, tb[ETHTOOL_A_FEATURES_WANTED], netdev_features_strings, info->extack); if (ret < 0) goto out_ops; if (ethnl_bitmap_to_features(req_mask) & ~NETIF_F_ETHTOOL_BITS) { GENL_SET_ERR_MSG(info, "attempt to change non-ethtool features"); ret = -EINVAL; goto out_ops; } /* set req_wanted bits not in req_mask from old_wanted */ bitmap_and(req_wanted, req_wanted, req_mask, NETDEV_FEATURE_COUNT); bitmap_andnot(new_wanted, old_wanted, req_mask, NETDEV_FEATURE_COUNT); bitmap_or(req_wanted, new_wanted, req_wanted, NETDEV_FEATURE_COUNT); if (!bitmap_equal(req_wanted, old_wanted, NETDEV_FEATURE_COUNT)) { dev->wanted_features &= ~dev->hw_features; dev->wanted_features |= ethnl_bitmap_to_features(req_wanted) & dev->hw_features; __netdev_update_features(dev); } ethnl_features_to_bitmap(new_active, dev->features); mod = !bitmap_equal(old_active, new_active, NETDEV_FEATURE_COUNT); ret = 0; if (!(req_info.flags & ETHTOOL_FLAG_OMIT_REPLY)) { bool compact = req_info.flags & ETHTOOL_FLAG_COMPACT_BITSETS; bitmap_xor(wanted_diff_mask, req_wanted, new_active, NETDEV_FEATURE_COUNT); bitmap_xor(active_diff_mask, old_active, new_active, NETDEV_FEATURE_COUNT); bitmap_and(wanted_diff_mask, wanted_diff_mask, req_mask, NETDEV_FEATURE_COUNT); bitmap_and(req_wanted, req_wanted, wanted_diff_mask, NETDEV_FEATURE_COUNT); bitmap_and(new_active, new_active, active_diff_mask, NETDEV_FEATURE_COUNT); ret = features_send_reply(dev, info, req_wanted, wanted_diff_mask, new_active, active_diff_mask, compact); } if (mod) netdev_features_change(dev); out_ops: ethnl_ops_complete(dev); out_rtnl: rtnl_unlock(); ethnl_parse_header_dev_put(&req_info); return ret; }
14 14 9 6 6 6 6 6 6 5 1 6 6 15 15 15 15 15 15 8 7 14 15 4 15 15 10 9 9 10 10 13 4 5 1 3 1 4 5 5 2 2 1 1 1 2 2 1 3 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 // 
SPDX-License-Identifier: GPL-2.0 /* * Some IBSS support code for cfg80211. * * Copyright 2009 Johannes Berg <johannes@sipsolutions.net> * Copyright (C) 2020-2023 Intel Corporation */ #include <linux/etherdevice.h> #include <linux/if_arp.h> #include <linux/slab.h> #include <linux/export.h> #include <net/cfg80211.h> #include "wext-compat.h" #include "nl80211.h" #include "rdev-ops.h" void __cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid, struct ieee80211_channel *channel) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_bss *bss; #ifdef CONFIG_CFG80211_WEXT union iwreq_data wrqu; #endif if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC)) return; if (!wdev->u.ibss.ssid_len) return; bss = cfg80211_get_bss(wdev->wiphy, channel, bssid, NULL, 0, IEEE80211_BSS_TYPE_IBSS, IEEE80211_PRIVACY_ANY); if (WARN_ON(!bss)) return; if (wdev->u.ibss.current_bss) { cfg80211_unhold_bss(wdev->u.ibss.current_bss); cfg80211_put_bss(wdev->wiphy, &wdev->u.ibss.current_bss->pub); } cfg80211_hold_bss(bss_from_pub(bss)); wdev->u.ibss.current_bss = bss_from_pub(bss); cfg80211_upload_connect_keys(wdev); nl80211_send_ibss_bssid(wiphy_to_rdev(wdev->wiphy), dev, bssid, GFP_KERNEL); #ifdef CONFIG_CFG80211_WEXT memset(&wrqu, 0, sizeof(wrqu)); memcpy(wrqu.ap_addr.sa_data, bssid, ETH_ALEN); wireless_send_event(dev, SIOCGIWAP, &wrqu, NULL); #endif } void cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid, struct ieee80211_channel *channel, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_event *ev; unsigned long flags; trace_cfg80211_ibss_joined(dev, bssid, channel); if (WARN_ON(!channel)) return; ev = kzalloc(sizeof(*ev), gfp); if (!ev) return; ev->type = EVENT_IBSS_JOINED; memcpy(ev->ij.bssid, bssid, ETH_ALEN); ev->ij.channel = channel; spin_lock_irqsave(&wdev->event_lock, flags); list_add_tail(&ev->list, &wdev->event_list); spin_unlock_irqrestore(&wdev->event_lock, 
flags); queue_work(cfg80211_wq, &rdev->event_work); } EXPORT_SYMBOL(cfg80211_ibss_joined); int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_ibss_params *params, struct cfg80211_cached_keys *connkeys) { struct wireless_dev *wdev = dev->ieee80211_ptr; int err; lockdep_assert_held(&rdev->wiphy.mtx); if (wdev->u.ibss.ssid_len) return -EALREADY; if (!params->basic_rates) { /* * If no rates were explicitly configured, * use the mandatory rate set for 11b or * 11a for maximum compatibility. */ struct ieee80211_supported_band *sband; enum nl80211_band band; u32 flag; int j; band = params->chandef.chan->band; if (band == NL80211_BAND_5GHZ || band == NL80211_BAND_6GHZ) flag = IEEE80211_RATE_MANDATORY_A; else flag = IEEE80211_RATE_MANDATORY_B; sband = rdev->wiphy.bands[band]; for (j = 0; j < sband->n_bitrates; j++) { if (sband->bitrates[j].flags & flag) params->basic_rates |= BIT(j); } } if (WARN_ON(connkeys && connkeys->def < 0)) return -EINVAL; if (WARN_ON(wdev->connect_keys)) kfree_sensitive(wdev->connect_keys); wdev->connect_keys = connkeys; wdev->u.ibss.chandef = params->chandef; if (connkeys) { params->wep_keys = connkeys->params; params->wep_tx_key = connkeys->def; } #ifdef CONFIG_CFG80211_WEXT wdev->wext.ibss.chandef = params->chandef; #endif err = rdev_join_ibss(rdev, dev, params); if (err) { wdev->connect_keys = NULL; return err; } memcpy(wdev->u.ibss.ssid, params->ssid, params->ssid_len); wdev->u.ibss.ssid_len = params->ssid_len; return 0; } void cfg80211_clear_ibss(struct net_device *dev, bool nowext) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); int i; lockdep_assert_wiphy(wdev->wiphy); kfree_sensitive(wdev->connect_keys); wdev->connect_keys = NULL; rdev_set_qos_map(rdev, dev, NULL); /* * Delete all the keys ... pairwise keys can't really * exist any more anyway, but default keys might. 
*/ if (rdev->ops->del_key) for (i = 0; i < 6; i++) rdev_del_key(rdev, dev, -1, i, false, NULL); if (wdev->u.ibss.current_bss) { cfg80211_unhold_bss(wdev->u.ibss.current_bss); cfg80211_put_bss(wdev->wiphy, &wdev->u.ibss.current_bss->pub); } wdev->u.ibss.current_bss = NULL; wdev->u.ibss.ssid_len = 0; memset(&wdev->u.ibss.chandef, 0, sizeof(wdev->u.ibss.chandef)); #ifdef CONFIG_CFG80211_WEXT if (!nowext) wdev->wext.ibss.ssid_len = 0; #endif cfg80211_sched_dfs_chan_update(rdev); } int cfg80211_leave_ibss(struct cfg80211_registered_device *rdev, struct net_device *dev, bool nowext) { struct wireless_dev *wdev = dev->ieee80211_ptr; int err; lockdep_assert_wiphy(wdev->wiphy); if (!wdev->u.ibss.ssid_len) return -ENOLINK; err = rdev_leave_ibss(rdev, dev); if (err) return err; wdev->conn_owner_nlportid = 0; cfg80211_clear_ibss(dev, nowext); return 0; } #ifdef CONFIG_CFG80211_WEXT int cfg80211_ibss_wext_join(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { struct cfg80211_cached_keys *ck = NULL; enum nl80211_band band; int i, err; lockdep_assert_wiphy(wdev->wiphy); if (!wdev->wext.ibss.beacon_interval) wdev->wext.ibss.beacon_interval = 100; /* try to find an IBSS channel if none requested ... 
*/ if (!wdev->wext.ibss.chandef.chan) { struct ieee80211_channel *new_chan = NULL; for (band = 0; band < NUM_NL80211_BANDS; band++) { struct ieee80211_supported_band *sband; struct ieee80211_channel *chan; sband = rdev->wiphy.bands[band]; if (!sband) continue; for (i = 0; i < sband->n_channels; i++) { chan = &sband->channels[i]; if (chan->flags & IEEE80211_CHAN_NO_IR) continue; if (chan->flags & IEEE80211_CHAN_DISABLED) continue; new_chan = chan; break; } if (new_chan) break; } if (!new_chan) return -EINVAL; cfg80211_chandef_create(&wdev->wext.ibss.chandef, new_chan, NL80211_CHAN_NO_HT); } /* don't join -- SSID is not there */ if (!wdev->wext.ibss.ssid_len) return 0; if (!netif_running(wdev->netdev)) return 0; if (wdev->wext.keys) wdev->wext.keys->def = wdev->wext.default_key; wdev->wext.ibss.privacy = wdev->wext.default_key != -1; if (wdev->wext.keys && wdev->wext.keys->def != -1) { ck = kmemdup(wdev->wext.keys, sizeof(*ck), GFP_KERNEL); if (!ck) return -ENOMEM; for (i = 0; i < 4; i++) ck->params[i].key = ck->data[i]; } err = __cfg80211_join_ibss(rdev, wdev->netdev, &wdev->wext.ibss, ck); if (err) kfree(ck); return err; } int cfg80211_ibss_wext_siwfreq(struct net_device *dev, struct iw_request_info *info, struct iw_freq *wextfreq, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct ieee80211_channel *chan = NULL; int err, freq; /* call only for ibss! 
*/ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC)) return -EINVAL; if (!rdev->ops->join_ibss) return -EOPNOTSUPP; freq = cfg80211_wext_freq(wextfreq); if (freq < 0) return freq; if (freq) { chan = ieee80211_get_channel(wdev->wiphy, freq); if (!chan) return -EINVAL; if (chan->flags & IEEE80211_CHAN_NO_IR || chan->flags & IEEE80211_CHAN_DISABLED) return -EINVAL; } if (wdev->wext.ibss.chandef.chan == chan) return 0; err = 0; if (wdev->u.ibss.ssid_len) err = cfg80211_leave_ibss(rdev, dev, true); if (err) return err; if (chan) { cfg80211_chandef_create(&wdev->wext.ibss.chandef, chan, NL80211_CHAN_NO_HT); wdev->wext.ibss.channel_fixed = true; } else { /* cfg80211_ibss_wext_join will pick one if needed */ wdev->wext.ibss.channel_fixed = false; } return cfg80211_ibss_wext_join(rdev, wdev); } int cfg80211_ibss_wext_giwfreq(struct net_device *dev, struct iw_request_info *info, struct iw_freq *freq, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct ieee80211_channel *chan = NULL; /* call only for ibss! */ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC)) return -EINVAL; if (wdev->u.ibss.current_bss) chan = wdev->u.ibss.current_bss->pub.channel; else if (wdev->wext.ibss.chandef.chan) chan = wdev->wext.ibss.chandef.chan; if (chan) { freq->m = chan->center_freq; freq->e = 6; return 0; } /* no channel if not joining */ return -EINVAL; } int cfg80211_ibss_wext_siwessid(struct net_device *dev, struct iw_request_info *info, struct iw_point *data, char *ssid) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); size_t len = data->length; int err; /* call only for ibss! */ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC)) return -EINVAL; if (!rdev->ops->join_ibss) return -EOPNOTSUPP; err = 0; if (wdev->u.ibss.ssid_len) err = cfg80211_leave_ibss(rdev, dev, true); if (err) return err; /* iwconfig uses nul termination in SSID.. 
*/ if (len > 0 && ssid[len - 1] == '\0') len--; memcpy(wdev->u.ibss.ssid, ssid, len); wdev->wext.ibss.ssid = wdev->u.ibss.ssid; wdev->wext.ibss.ssid_len = len; return cfg80211_ibss_wext_join(rdev, wdev); } int cfg80211_ibss_wext_giwessid(struct net_device *dev, struct iw_request_info *info, struct iw_point *data, char *ssid) { struct wireless_dev *wdev = dev->ieee80211_ptr; /* call only for ibss! */ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC)) return -EINVAL; data->flags = 0; if (wdev->u.ibss.ssid_len) { data->flags = 1; data->length = wdev->u.ibss.ssid_len; memcpy(ssid, wdev->u.ibss.ssid, data->length); } else if (wdev->wext.ibss.ssid && wdev->wext.ibss.ssid_len) { data->flags = 1; data->length = wdev->wext.ibss.ssid_len; memcpy(ssid, wdev->wext.ibss.ssid, data->length); } return 0; } int cfg80211_ibss_wext_siwap(struct net_device *dev, struct iw_request_info *info, struct sockaddr *ap_addr, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); u8 *bssid = ap_addr->sa_data; int err; /* call only for ibss! 
*/ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC)) return -EINVAL; if (!rdev->ops->join_ibss) return -EOPNOTSUPP; if (ap_addr->sa_family != ARPHRD_ETHER) return -EINVAL; /* automatic mode */ if (is_zero_ether_addr(bssid) || is_broadcast_ether_addr(bssid)) bssid = NULL; if (bssid && !is_valid_ether_addr(bssid)) return -EINVAL; /* both automatic */ if (!bssid && !wdev->wext.ibss.bssid) return 0; /* fixed already - and no change */ if (wdev->wext.ibss.bssid && bssid && ether_addr_equal(bssid, wdev->wext.ibss.bssid)) return 0; err = 0; if (wdev->u.ibss.ssid_len) err = cfg80211_leave_ibss(rdev, dev, true); if (err) return err; if (bssid) { memcpy(wdev->wext.bssid, bssid, ETH_ALEN); wdev->wext.ibss.bssid = wdev->wext.bssid; } else wdev->wext.ibss.bssid = NULL; return cfg80211_ibss_wext_join(rdev, wdev); } int cfg80211_ibss_wext_giwap(struct net_device *dev, struct iw_request_info *info, struct sockaddr *ap_addr, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; /* call only for ibss! */ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC)) return -EINVAL; ap_addr->sa_family = ARPHRD_ETHER; if (wdev->u.ibss.current_bss) memcpy(ap_addr->sa_data, wdev->u.ibss.current_bss->pub.bssid, ETH_ALEN); else if (wdev->wext.ibss.bssid) memcpy(ap_addr->sa_data, wdev->wext.ibss.bssid, ETH_ALEN); else eth_zero_addr(ap_addr->sa_data); return 0; } #endif
9 9 9 8 7 7 7 5 5 5 7 2 8 1 7 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 
519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 
1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 /* * Cryptographic API. * * Whirlpool hashing Algorithm * * The Whirlpool algorithm was developed by Paulo S. L. M. Barreto and * Vincent Rijmen. It has been selected as one of cryptographic * primitives by the NESSIE project http://www.cryptonessie.org/ * * The original authors have disclaimed all copyright interest in this * code and thus put it in the public domain. The subsequent authors * have put this under the GNU General Public License. * * By Aaron Grothe ajgrothe@yahoo.com, August 23, 2004 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. 
* */ #include <crypto/internal/hash.h> #include <linux/init.h> #include <linux/module.h> #include <linux/mm.h> #include <asm/byteorder.h> #include <linux/types.h> #define WP512_DIGEST_SIZE 64 #define WP384_DIGEST_SIZE 48 #define WP256_DIGEST_SIZE 32 #define WP512_BLOCK_SIZE 64 #define WP512_LENGTHBYTES 32 #define WHIRLPOOL_ROUNDS 10 struct wp512_ctx { u8 bitLength[WP512_LENGTHBYTES]; u8 buffer[WP512_BLOCK_SIZE]; int bufferBits; int bufferPos; u64 hash[WP512_DIGEST_SIZE/8]; }; /* * Though Whirlpool is endianness-neutral, the encryption tables are listed * in BIG-ENDIAN format, which is adopted throughout this implementation * (but little-endian notation would be equally suitable if consistently * employed). */ static const u64 C0[256] = { 0x18186018c07830d8ULL, 0x23238c2305af4626ULL, 0xc6c63fc67ef991b8ULL, 0xe8e887e8136fcdfbULL, 0x878726874ca113cbULL, 0xb8b8dab8a9626d11ULL, 0x0101040108050209ULL, 0x4f4f214f426e9e0dULL, 0x3636d836adee6c9bULL, 0xa6a6a2a6590451ffULL, 0xd2d26fd2debdb90cULL, 0xf5f5f3f5fb06f70eULL, 0x7979f979ef80f296ULL, 0x6f6fa16f5fcede30ULL, 0x91917e91fcef3f6dULL, 0x52525552aa07a4f8ULL, 0x60609d6027fdc047ULL, 0xbcbccabc89766535ULL, 0x9b9b569baccd2b37ULL, 0x8e8e028e048c018aULL, 0xa3a3b6a371155bd2ULL, 0x0c0c300c603c186cULL, 0x7b7bf17bff8af684ULL, 0x3535d435b5e16a80ULL, 0x1d1d741de8693af5ULL, 0xe0e0a7e05347ddb3ULL, 0xd7d77bd7f6acb321ULL, 0xc2c22fc25eed999cULL, 0x2e2eb82e6d965c43ULL, 0x4b4b314b627a9629ULL, 0xfefedffea321e15dULL, 0x575741578216aed5ULL, 0x15155415a8412abdULL, 0x7777c1779fb6eee8ULL, 0x3737dc37a5eb6e92ULL, 0xe5e5b3e57b56d79eULL, 0x9f9f469f8cd92313ULL, 0xf0f0e7f0d317fd23ULL, 0x4a4a354a6a7f9420ULL, 0xdada4fda9e95a944ULL, 0x58587d58fa25b0a2ULL, 0xc9c903c906ca8fcfULL, 0x2929a429558d527cULL, 0x0a0a280a5022145aULL, 0xb1b1feb1e14f7f50ULL, 0xa0a0baa0691a5dc9ULL, 0x6b6bb16b7fdad614ULL, 0x85852e855cab17d9ULL, 0xbdbdcebd8173673cULL, 0x5d5d695dd234ba8fULL, 0x1010401080502090ULL, 0xf4f4f7f4f303f507ULL, 0xcbcb0bcb16c08bddULL, 0x3e3ef83eedc67cd3ULL, 
0x0505140528110a2dULL, 0x676781671fe6ce78ULL, 0xe4e4b7e47353d597ULL, 0x27279c2725bb4e02ULL, 0x4141194132588273ULL, 0x8b8b168b2c9d0ba7ULL, 0xa7a7a6a7510153f6ULL, 0x7d7de97dcf94fab2ULL, 0x95956e95dcfb3749ULL, 0xd8d847d88e9fad56ULL, 0xfbfbcbfb8b30eb70ULL, 0xeeee9fee2371c1cdULL, 0x7c7ced7cc791f8bbULL, 0x6666856617e3cc71ULL, 0xdddd53dda68ea77bULL, 0x17175c17b84b2eafULL, 0x4747014702468e45ULL, 0x9e9e429e84dc211aULL, 0xcaca0fca1ec589d4ULL, 0x2d2db42d75995a58ULL, 0xbfbfc6bf9179632eULL, 0x07071c07381b0e3fULL, 0xadad8ead012347acULL, 0x5a5a755aea2fb4b0ULL, 0x838336836cb51befULL, 0x3333cc3385ff66b6ULL, 0x636391633ff2c65cULL, 0x02020802100a0412ULL, 0xaaaa92aa39384993ULL, 0x7171d971afa8e2deULL, 0xc8c807c80ecf8dc6ULL, 0x19196419c87d32d1ULL, 0x494939497270923bULL, 0xd9d943d9869aaf5fULL, 0xf2f2eff2c31df931ULL, 0xe3e3abe34b48dba8ULL, 0x5b5b715be22ab6b9ULL, 0x88881a8834920dbcULL, 0x9a9a529aa4c8293eULL, 0x262698262dbe4c0bULL, 0x3232c8328dfa64bfULL, 0xb0b0fab0e94a7d59ULL, 0xe9e983e91b6acff2ULL, 0x0f0f3c0f78331e77ULL, 0xd5d573d5e6a6b733ULL, 0x80803a8074ba1df4ULL, 0xbebec2be997c6127ULL, 0xcdcd13cd26de87ebULL, 0x3434d034bde46889ULL, 0x48483d487a759032ULL, 0xffffdbffab24e354ULL, 0x7a7af57af78ff48dULL, 0x90907a90f4ea3d64ULL, 0x5f5f615fc23ebe9dULL, 0x202080201da0403dULL, 0x6868bd6867d5d00fULL, 0x1a1a681ad07234caULL, 0xaeae82ae192c41b7ULL, 0xb4b4eab4c95e757dULL, 0x54544d549a19a8ceULL, 0x93937693ece53b7fULL, 0x222288220daa442fULL, 0x64648d6407e9c863ULL, 0xf1f1e3f1db12ff2aULL, 0x7373d173bfa2e6ccULL, 0x12124812905a2482ULL, 0x40401d403a5d807aULL, 0x0808200840281048ULL, 0xc3c32bc356e89b95ULL, 0xecec97ec337bc5dfULL, 0xdbdb4bdb9690ab4dULL, 0xa1a1bea1611f5fc0ULL, 0x8d8d0e8d1c830791ULL, 0x3d3df43df5c97ac8ULL, 0x97976697ccf1335bULL, 0x0000000000000000ULL, 0xcfcf1bcf36d483f9ULL, 0x2b2bac2b4587566eULL, 0x7676c57697b3ece1ULL, 0x8282328264b019e6ULL, 0xd6d67fd6fea9b128ULL, 0x1b1b6c1bd87736c3ULL, 0xb5b5eeb5c15b7774ULL, 0xafaf86af112943beULL, 0x6a6ab56a77dfd41dULL, 0x50505d50ba0da0eaULL, 
0x45450945124c8a57ULL, 0xf3f3ebf3cb18fb38ULL, 0x3030c0309df060adULL, 0xefef9bef2b74c3c4ULL, 0x3f3ffc3fe5c37edaULL, 0x55554955921caac7ULL, 0xa2a2b2a2791059dbULL, 0xeaea8fea0365c9e9ULL, 0x656589650fecca6aULL, 0xbabad2bab9686903ULL, 0x2f2fbc2f65935e4aULL, 0xc0c027c04ee79d8eULL, 0xdede5fdebe81a160ULL, 0x1c1c701ce06c38fcULL, 0xfdfdd3fdbb2ee746ULL, 0x4d4d294d52649a1fULL, 0x92927292e4e03976ULL, 0x7575c9758fbceafaULL, 0x06061806301e0c36ULL, 0x8a8a128a249809aeULL, 0xb2b2f2b2f940794bULL, 0xe6e6bfe66359d185ULL, 0x0e0e380e70361c7eULL, 0x1f1f7c1ff8633ee7ULL, 0x6262956237f7c455ULL, 0xd4d477d4eea3b53aULL, 0xa8a89aa829324d81ULL, 0x96966296c4f43152ULL, 0xf9f9c3f99b3aef62ULL, 0xc5c533c566f697a3ULL, 0x2525942535b14a10ULL, 0x59597959f220b2abULL, 0x84842a8454ae15d0ULL, 0x7272d572b7a7e4c5ULL, 0x3939e439d5dd72ecULL, 0x4c4c2d4c5a619816ULL, 0x5e5e655eca3bbc94ULL, 0x7878fd78e785f09fULL, 0x3838e038ddd870e5ULL, 0x8c8c0a8c14860598ULL, 0xd1d163d1c6b2bf17ULL, 0xa5a5aea5410b57e4ULL, 0xe2e2afe2434dd9a1ULL, 0x616199612ff8c24eULL, 0xb3b3f6b3f1457b42ULL, 0x2121842115a54234ULL, 0x9c9c4a9c94d62508ULL, 0x1e1e781ef0663ceeULL, 0x4343114322528661ULL, 0xc7c73bc776fc93b1ULL, 0xfcfcd7fcb32be54fULL, 0x0404100420140824ULL, 0x51515951b208a2e3ULL, 0x99995e99bcc72f25ULL, 0x6d6da96d4fc4da22ULL, 0x0d0d340d68391a65ULL, 0xfafacffa8335e979ULL, 0xdfdf5bdfb684a369ULL, 0x7e7ee57ed79bfca9ULL, 0x242490243db44819ULL, 0x3b3bec3bc5d776feULL, 0xabab96ab313d4b9aULL, 0xcece1fce3ed181f0ULL, 0x1111441188552299ULL, 0x8f8f068f0c890383ULL, 0x4e4e254e4a6b9c04ULL, 0xb7b7e6b7d1517366ULL, 0xebeb8beb0b60cbe0ULL, 0x3c3cf03cfdcc78c1ULL, 0x81813e817cbf1ffdULL, 0x94946a94d4fe3540ULL, 0xf7f7fbf7eb0cf31cULL, 0xb9b9deb9a1676f18ULL, 0x13134c13985f268bULL, 0x2c2cb02c7d9c5851ULL, 0xd3d36bd3d6b8bb05ULL, 0xe7e7bbe76b5cd38cULL, 0x6e6ea56e57cbdc39ULL, 0xc4c437c46ef395aaULL, 0x03030c03180f061bULL, 0x565645568a13acdcULL, 0x44440d441a49885eULL, 0x7f7fe17fdf9efea0ULL, 0xa9a99ea921374f88ULL, 0x2a2aa82a4d825467ULL, 0xbbbbd6bbb16d6b0aULL, 
0xc1c123c146e29f87ULL, 0x53535153a202a6f1ULL, 0xdcdc57dcae8ba572ULL, 0x0b0b2c0b58271653ULL, 0x9d9d4e9d9cd32701ULL, 0x6c6cad6c47c1d82bULL, 0x3131c43195f562a4ULL, 0x7474cd7487b9e8f3ULL, 0xf6f6fff6e309f115ULL, 0x464605460a438c4cULL, 0xacac8aac092645a5ULL, 0x89891e893c970fb5ULL, 0x14145014a04428b4ULL, 0xe1e1a3e15b42dfbaULL, 0x16165816b04e2ca6ULL, 0x3a3ae83acdd274f7ULL, 0x6969b9696fd0d206ULL, 0x09092409482d1241ULL, 0x7070dd70a7ade0d7ULL, 0xb6b6e2b6d954716fULL, 0xd0d067d0ceb7bd1eULL, 0xeded93ed3b7ec7d6ULL, 0xcccc17cc2edb85e2ULL, 0x424215422a578468ULL, 0x98985a98b4c22d2cULL, 0xa4a4aaa4490e55edULL, 0x2828a0285d885075ULL, 0x5c5c6d5cda31b886ULL, 0xf8f8c7f8933fed6bULL, 0x8686228644a411c2ULL, }; static const u64 C1[256] = { 0xd818186018c07830ULL, 0x2623238c2305af46ULL, 0xb8c6c63fc67ef991ULL, 0xfbe8e887e8136fcdULL, 0xcb878726874ca113ULL, 0x11b8b8dab8a9626dULL, 0x0901010401080502ULL, 0x0d4f4f214f426e9eULL, 0x9b3636d836adee6cULL, 0xffa6a6a2a6590451ULL, 0x0cd2d26fd2debdb9ULL, 0x0ef5f5f3f5fb06f7ULL, 0x967979f979ef80f2ULL, 0x306f6fa16f5fcedeULL, 0x6d91917e91fcef3fULL, 0xf852525552aa07a4ULL, 0x4760609d6027fdc0ULL, 0x35bcbccabc897665ULL, 0x379b9b569baccd2bULL, 0x8a8e8e028e048c01ULL, 0xd2a3a3b6a371155bULL, 0x6c0c0c300c603c18ULL, 0x847b7bf17bff8af6ULL, 0x803535d435b5e16aULL, 0xf51d1d741de8693aULL, 0xb3e0e0a7e05347ddULL, 0x21d7d77bd7f6acb3ULL, 0x9cc2c22fc25eed99ULL, 0x432e2eb82e6d965cULL, 0x294b4b314b627a96ULL, 0x5dfefedffea321e1ULL, 0xd5575741578216aeULL, 0xbd15155415a8412aULL, 0xe87777c1779fb6eeULL, 0x923737dc37a5eb6eULL, 0x9ee5e5b3e57b56d7ULL, 0x139f9f469f8cd923ULL, 0x23f0f0e7f0d317fdULL, 0x204a4a354a6a7f94ULL, 0x44dada4fda9e95a9ULL, 0xa258587d58fa25b0ULL, 0xcfc9c903c906ca8fULL, 0x7c2929a429558d52ULL, 0x5a0a0a280a502214ULL, 0x50b1b1feb1e14f7fULL, 0xc9a0a0baa0691a5dULL, 0x146b6bb16b7fdad6ULL, 0xd985852e855cab17ULL, 0x3cbdbdcebd817367ULL, 0x8f5d5d695dd234baULL, 0x9010104010805020ULL, 0x07f4f4f7f4f303f5ULL, 0xddcbcb0bcb16c08bULL, 0xd33e3ef83eedc67cULL, 0x2d0505140528110aULL, 
0x78676781671fe6ceULL, 0x97e4e4b7e47353d5ULL, 0x0227279c2725bb4eULL, 0x7341411941325882ULL, 0xa78b8b168b2c9d0bULL, 0xf6a7a7a6a7510153ULL, 0xb27d7de97dcf94faULL, 0x4995956e95dcfb37ULL, 0x56d8d847d88e9fadULL, 0x70fbfbcbfb8b30ebULL, 0xcdeeee9fee2371c1ULL, 0xbb7c7ced7cc791f8ULL, 0x716666856617e3ccULL, 0x7bdddd53dda68ea7ULL, 0xaf17175c17b84b2eULL, 0x454747014702468eULL, 0x1a9e9e429e84dc21ULL, 0xd4caca0fca1ec589ULL, 0x582d2db42d75995aULL, 0x2ebfbfc6bf917963ULL, 0x3f07071c07381b0eULL, 0xacadad8ead012347ULL, 0xb05a5a755aea2fb4ULL, 0xef838336836cb51bULL, 0xb63333cc3385ff66ULL, 0x5c636391633ff2c6ULL, 0x1202020802100a04ULL, 0x93aaaa92aa393849ULL, 0xde7171d971afa8e2ULL, 0xc6c8c807c80ecf8dULL, 0xd119196419c87d32ULL, 0x3b49493949727092ULL, 0x5fd9d943d9869aafULL, 0x31f2f2eff2c31df9ULL, 0xa8e3e3abe34b48dbULL, 0xb95b5b715be22ab6ULL, 0xbc88881a8834920dULL, 0x3e9a9a529aa4c829ULL, 0x0b262698262dbe4cULL, 0xbf3232c8328dfa64ULL, 0x59b0b0fab0e94a7dULL, 0xf2e9e983e91b6acfULL, 0x770f0f3c0f78331eULL, 0x33d5d573d5e6a6b7ULL, 0xf480803a8074ba1dULL, 0x27bebec2be997c61ULL, 0xebcdcd13cd26de87ULL, 0x893434d034bde468ULL, 0x3248483d487a7590ULL, 0x54ffffdbffab24e3ULL, 0x8d7a7af57af78ff4ULL, 0x6490907a90f4ea3dULL, 0x9d5f5f615fc23ebeULL, 0x3d202080201da040ULL, 0x0f6868bd6867d5d0ULL, 0xca1a1a681ad07234ULL, 0xb7aeae82ae192c41ULL, 0x7db4b4eab4c95e75ULL, 0xce54544d549a19a8ULL, 0x7f93937693ece53bULL, 0x2f222288220daa44ULL, 0x6364648d6407e9c8ULL, 0x2af1f1e3f1db12ffULL, 0xcc7373d173bfa2e6ULL, 0x8212124812905a24ULL, 0x7a40401d403a5d80ULL, 0x4808082008402810ULL, 0x95c3c32bc356e89bULL, 0xdfecec97ec337bc5ULL, 0x4ddbdb4bdb9690abULL, 0xc0a1a1bea1611f5fULL, 0x918d8d0e8d1c8307ULL, 0xc83d3df43df5c97aULL, 0x5b97976697ccf133ULL, 0x0000000000000000ULL, 0xf9cfcf1bcf36d483ULL, 0x6e2b2bac2b458756ULL, 0xe17676c57697b3ecULL, 0xe68282328264b019ULL, 0x28d6d67fd6fea9b1ULL, 0xc31b1b6c1bd87736ULL, 0x74b5b5eeb5c15b77ULL, 0xbeafaf86af112943ULL, 0x1d6a6ab56a77dfd4ULL, 0xea50505d50ba0da0ULL, 0x5745450945124c8aULL, 
0x38f3f3ebf3cb18fbULL, 0xad3030c0309df060ULL, 0xc4efef9bef2b74c3ULL, 0xda3f3ffc3fe5c37eULL, 0xc755554955921caaULL, 0xdba2a2b2a2791059ULL, 0xe9eaea8fea0365c9ULL, 0x6a656589650feccaULL, 0x03babad2bab96869ULL, 0x4a2f2fbc2f65935eULL, 0x8ec0c027c04ee79dULL, 0x60dede5fdebe81a1ULL, 0xfc1c1c701ce06c38ULL, 0x46fdfdd3fdbb2ee7ULL, 0x1f4d4d294d52649aULL, 0x7692927292e4e039ULL, 0xfa7575c9758fbceaULL, 0x3606061806301e0cULL, 0xae8a8a128a249809ULL, 0x4bb2b2f2b2f94079ULL, 0x85e6e6bfe66359d1ULL, 0x7e0e0e380e70361cULL, 0xe71f1f7c1ff8633eULL, 0x556262956237f7c4ULL, 0x3ad4d477d4eea3b5ULL, 0x81a8a89aa829324dULL, 0x5296966296c4f431ULL, 0x62f9f9c3f99b3aefULL, 0xa3c5c533c566f697ULL, 0x102525942535b14aULL, 0xab59597959f220b2ULL, 0xd084842a8454ae15ULL, 0xc57272d572b7a7e4ULL, 0xec3939e439d5dd72ULL, 0x164c4c2d4c5a6198ULL, 0x945e5e655eca3bbcULL, 0x9f7878fd78e785f0ULL, 0xe53838e038ddd870ULL, 0x988c8c0a8c148605ULL, 0x17d1d163d1c6b2bfULL, 0xe4a5a5aea5410b57ULL, 0xa1e2e2afe2434dd9ULL, 0x4e616199612ff8c2ULL, 0x42b3b3f6b3f1457bULL, 0x342121842115a542ULL, 0x089c9c4a9c94d625ULL, 0xee1e1e781ef0663cULL, 0x6143431143225286ULL, 0xb1c7c73bc776fc93ULL, 0x4ffcfcd7fcb32be5ULL, 0x2404041004201408ULL, 0xe351515951b208a2ULL, 0x2599995e99bcc72fULL, 0x226d6da96d4fc4daULL, 0x650d0d340d68391aULL, 0x79fafacffa8335e9ULL, 0x69dfdf5bdfb684a3ULL, 0xa97e7ee57ed79bfcULL, 0x19242490243db448ULL, 0xfe3b3bec3bc5d776ULL, 0x9aabab96ab313d4bULL, 0xf0cece1fce3ed181ULL, 0x9911114411885522ULL, 0x838f8f068f0c8903ULL, 0x044e4e254e4a6b9cULL, 0x66b7b7e6b7d15173ULL, 0xe0ebeb8beb0b60cbULL, 0xc13c3cf03cfdcc78ULL, 0xfd81813e817cbf1fULL, 0x4094946a94d4fe35ULL, 0x1cf7f7fbf7eb0cf3ULL, 0x18b9b9deb9a1676fULL, 0x8b13134c13985f26ULL, 0x512c2cb02c7d9c58ULL, 0x05d3d36bd3d6b8bbULL, 0x8ce7e7bbe76b5cd3ULL, 0x396e6ea56e57cbdcULL, 0xaac4c437c46ef395ULL, 0x1b03030c03180f06ULL, 0xdc565645568a13acULL, 0x5e44440d441a4988ULL, 0xa07f7fe17fdf9efeULL, 0x88a9a99ea921374fULL, 0x672a2aa82a4d8254ULL, 0x0abbbbd6bbb16d6bULL, 0x87c1c123c146e29fULL, 
0xf153535153a202a6ULL, 0x72dcdc57dcae8ba5ULL, 0x530b0b2c0b582716ULL, 0x019d9d4e9d9cd327ULL, 0x2b6c6cad6c47c1d8ULL, 0xa43131c43195f562ULL, 0xf37474cd7487b9e8ULL, 0x15f6f6fff6e309f1ULL, 0x4c464605460a438cULL, 0xa5acac8aac092645ULL, 0xb589891e893c970fULL, 0xb414145014a04428ULL, 0xbae1e1a3e15b42dfULL, 0xa616165816b04e2cULL, 0xf73a3ae83acdd274ULL, 0x066969b9696fd0d2ULL, 0x4109092409482d12ULL, 0xd77070dd70a7ade0ULL, 0x6fb6b6e2b6d95471ULL, 0x1ed0d067d0ceb7bdULL, 0xd6eded93ed3b7ec7ULL, 0xe2cccc17cc2edb85ULL, 0x68424215422a5784ULL, 0x2c98985a98b4c22dULL, 0xeda4a4aaa4490e55ULL, 0x752828a0285d8850ULL, 0x865c5c6d5cda31b8ULL, 0x6bf8f8c7f8933fedULL, 0xc28686228644a411ULL, }; static const u64 C2[256] = { 0x30d818186018c078ULL, 0x462623238c2305afULL, 0x91b8c6c63fc67ef9ULL, 0xcdfbe8e887e8136fULL, 0x13cb878726874ca1ULL, 0x6d11b8b8dab8a962ULL, 0x0209010104010805ULL, 0x9e0d4f4f214f426eULL, 0x6c9b3636d836adeeULL, 0x51ffa6a6a2a65904ULL, 0xb90cd2d26fd2debdULL, 0xf70ef5f5f3f5fb06ULL, 0xf2967979f979ef80ULL, 0xde306f6fa16f5fceULL, 0x3f6d91917e91fcefULL, 0xa4f852525552aa07ULL, 0xc04760609d6027fdULL, 0x6535bcbccabc8976ULL, 0x2b379b9b569baccdULL, 0x018a8e8e028e048cULL, 0x5bd2a3a3b6a37115ULL, 0x186c0c0c300c603cULL, 0xf6847b7bf17bff8aULL, 0x6a803535d435b5e1ULL, 0x3af51d1d741de869ULL, 0xddb3e0e0a7e05347ULL, 0xb321d7d77bd7f6acULL, 0x999cc2c22fc25eedULL, 0x5c432e2eb82e6d96ULL, 0x96294b4b314b627aULL, 0xe15dfefedffea321ULL, 0xaed5575741578216ULL, 0x2abd15155415a841ULL, 0xeee87777c1779fb6ULL, 0x6e923737dc37a5ebULL, 0xd79ee5e5b3e57b56ULL, 0x23139f9f469f8cd9ULL, 0xfd23f0f0e7f0d317ULL, 0x94204a4a354a6a7fULL, 0xa944dada4fda9e95ULL, 0xb0a258587d58fa25ULL, 0x8fcfc9c903c906caULL, 0x527c2929a429558dULL, 0x145a0a0a280a5022ULL, 0x7f50b1b1feb1e14fULL, 0x5dc9a0a0baa0691aULL, 0xd6146b6bb16b7fdaULL, 0x17d985852e855cabULL, 0x673cbdbdcebd8173ULL, 0xba8f5d5d695dd234ULL, 0x2090101040108050ULL, 0xf507f4f4f7f4f303ULL, 0x8bddcbcb0bcb16c0ULL, 0x7cd33e3ef83eedc6ULL, 0x0a2d050514052811ULL, 0xce78676781671fe6ULL, 
0xd597e4e4b7e47353ULL, 0x4e0227279c2725bbULL, 0x8273414119413258ULL, 0x0ba78b8b168b2c9dULL, 0x53f6a7a7a6a75101ULL, 0xfab27d7de97dcf94ULL, 0x374995956e95dcfbULL, 0xad56d8d847d88e9fULL, 0xeb70fbfbcbfb8b30ULL, 0xc1cdeeee9fee2371ULL, 0xf8bb7c7ced7cc791ULL, 0xcc716666856617e3ULL, 0xa77bdddd53dda68eULL, 0x2eaf17175c17b84bULL, 0x8e45474701470246ULL, 0x211a9e9e429e84dcULL, 0x89d4caca0fca1ec5ULL, 0x5a582d2db42d7599ULL, 0x632ebfbfc6bf9179ULL, 0x0e3f07071c07381bULL, 0x47acadad8ead0123ULL, 0xb4b05a5a755aea2fULL, 0x1bef838336836cb5ULL, 0x66b63333cc3385ffULL, 0xc65c636391633ff2ULL, 0x041202020802100aULL, 0x4993aaaa92aa3938ULL, 0xe2de7171d971afa8ULL, 0x8dc6c8c807c80ecfULL, 0x32d119196419c87dULL, 0x923b494939497270ULL, 0xaf5fd9d943d9869aULL, 0xf931f2f2eff2c31dULL, 0xdba8e3e3abe34b48ULL, 0xb6b95b5b715be22aULL, 0x0dbc88881a883492ULL, 0x293e9a9a529aa4c8ULL, 0x4c0b262698262dbeULL, 0x64bf3232c8328dfaULL, 0x7d59b0b0fab0e94aULL, 0xcff2e9e983e91b6aULL, 0x1e770f0f3c0f7833ULL, 0xb733d5d573d5e6a6ULL, 0x1df480803a8074baULL, 0x6127bebec2be997cULL, 0x87ebcdcd13cd26deULL, 0x68893434d034bde4ULL, 0x903248483d487a75ULL, 0xe354ffffdbffab24ULL, 0xf48d7a7af57af78fULL, 0x3d6490907a90f4eaULL, 0xbe9d5f5f615fc23eULL, 0x403d202080201da0ULL, 0xd00f6868bd6867d5ULL, 0x34ca1a1a681ad072ULL, 0x41b7aeae82ae192cULL, 0x757db4b4eab4c95eULL, 0xa8ce54544d549a19ULL, 0x3b7f93937693ece5ULL, 0x442f222288220daaULL, 0xc86364648d6407e9ULL, 0xff2af1f1e3f1db12ULL, 0xe6cc7373d173bfa2ULL, 0x248212124812905aULL, 0x807a40401d403a5dULL, 0x1048080820084028ULL, 0x9b95c3c32bc356e8ULL, 0xc5dfecec97ec337bULL, 0xab4ddbdb4bdb9690ULL, 0x5fc0a1a1bea1611fULL, 0x07918d8d0e8d1c83ULL, 0x7ac83d3df43df5c9ULL, 0x335b97976697ccf1ULL, 0x0000000000000000ULL, 0x83f9cfcf1bcf36d4ULL, 0x566e2b2bac2b4587ULL, 0xece17676c57697b3ULL, 0x19e68282328264b0ULL, 0xb128d6d67fd6fea9ULL, 0x36c31b1b6c1bd877ULL, 0x7774b5b5eeb5c15bULL, 0x43beafaf86af1129ULL, 0xd41d6a6ab56a77dfULL, 0xa0ea50505d50ba0dULL, 0x8a5745450945124cULL, 0xfb38f3f3ebf3cb18ULL, 
0x60ad3030c0309df0ULL, 0xc3c4efef9bef2b74ULL, 0x7eda3f3ffc3fe5c3ULL, 0xaac755554955921cULL, 0x59dba2a2b2a27910ULL, 0xc9e9eaea8fea0365ULL, 0xca6a656589650fecULL, 0x6903babad2bab968ULL, 0x5e4a2f2fbc2f6593ULL, 0x9d8ec0c027c04ee7ULL, 0xa160dede5fdebe81ULL, 0x38fc1c1c701ce06cULL, 0xe746fdfdd3fdbb2eULL, 0x9a1f4d4d294d5264ULL, 0x397692927292e4e0ULL, 0xeafa7575c9758fbcULL, 0x0c3606061806301eULL, 0x09ae8a8a128a2498ULL, 0x794bb2b2f2b2f940ULL, 0xd185e6e6bfe66359ULL, 0x1c7e0e0e380e7036ULL, 0x3ee71f1f7c1ff863ULL, 0xc4556262956237f7ULL, 0xb53ad4d477d4eea3ULL, 0x4d81a8a89aa82932ULL, 0x315296966296c4f4ULL, 0xef62f9f9c3f99b3aULL, 0x97a3c5c533c566f6ULL, 0x4a102525942535b1ULL, 0xb2ab59597959f220ULL, 0x15d084842a8454aeULL, 0xe4c57272d572b7a7ULL, 0x72ec3939e439d5ddULL, 0x98164c4c2d4c5a61ULL, 0xbc945e5e655eca3bULL, 0xf09f7878fd78e785ULL, 0x70e53838e038ddd8ULL, 0x05988c8c0a8c1486ULL, 0xbf17d1d163d1c6b2ULL, 0x57e4a5a5aea5410bULL, 0xd9a1e2e2afe2434dULL, 0xc24e616199612ff8ULL, 0x7b42b3b3f6b3f145ULL, 0x42342121842115a5ULL, 0x25089c9c4a9c94d6ULL, 0x3cee1e1e781ef066ULL, 0x8661434311432252ULL, 0x93b1c7c73bc776fcULL, 0xe54ffcfcd7fcb32bULL, 0x0824040410042014ULL, 0xa2e351515951b208ULL, 0x2f2599995e99bcc7ULL, 0xda226d6da96d4fc4ULL, 0x1a650d0d340d6839ULL, 0xe979fafacffa8335ULL, 0xa369dfdf5bdfb684ULL, 0xfca97e7ee57ed79bULL, 0x4819242490243db4ULL, 0x76fe3b3bec3bc5d7ULL, 0x4b9aabab96ab313dULL, 0x81f0cece1fce3ed1ULL, 0x2299111144118855ULL, 0x03838f8f068f0c89ULL, 0x9c044e4e254e4a6bULL, 0x7366b7b7e6b7d151ULL, 0xcbe0ebeb8beb0b60ULL, 0x78c13c3cf03cfdccULL, 0x1ffd81813e817cbfULL, 0x354094946a94d4feULL, 0xf31cf7f7fbf7eb0cULL, 0x6f18b9b9deb9a167ULL, 0x268b13134c13985fULL, 0x58512c2cb02c7d9cULL, 0xbb05d3d36bd3d6b8ULL, 0xd38ce7e7bbe76b5cULL, 0xdc396e6ea56e57cbULL, 0x95aac4c437c46ef3ULL, 0x061b03030c03180fULL, 0xacdc565645568a13ULL, 0x885e44440d441a49ULL, 0xfea07f7fe17fdf9eULL, 0x4f88a9a99ea92137ULL, 0x54672a2aa82a4d82ULL, 0x6b0abbbbd6bbb16dULL, 0x9f87c1c123c146e2ULL, 0xa6f153535153a202ULL, 
0xa572dcdc57dcae8bULL, 0x16530b0b2c0b5827ULL, 0x27019d9d4e9d9cd3ULL, 0xd82b6c6cad6c47c1ULL, 0x62a43131c43195f5ULL, 0xe8f37474cd7487b9ULL, 0xf115f6f6fff6e309ULL, 0x8c4c464605460a43ULL, 0x45a5acac8aac0926ULL, 0x0fb589891e893c97ULL, 0x28b414145014a044ULL, 0xdfbae1e1a3e15b42ULL, 0x2ca616165816b04eULL, 0x74f73a3ae83acdd2ULL, 0xd2066969b9696fd0ULL, 0x124109092409482dULL, 0xe0d77070dd70a7adULL, 0x716fb6b6e2b6d954ULL, 0xbd1ed0d067d0ceb7ULL, 0xc7d6eded93ed3b7eULL, 0x85e2cccc17cc2edbULL, 0x8468424215422a57ULL, 0x2d2c98985a98b4c2ULL, 0x55eda4a4aaa4490eULL, 0x50752828a0285d88ULL, 0xb8865c5c6d5cda31ULL, 0xed6bf8f8c7f8933fULL, 0x11c28686228644a4ULL, }; static const u64 C3[256] = { 0x7830d818186018c0ULL, 0xaf462623238c2305ULL, 0xf991b8c6c63fc67eULL, 0x6fcdfbe8e887e813ULL, 0xa113cb878726874cULL, 0x626d11b8b8dab8a9ULL, 0x0502090101040108ULL, 0x6e9e0d4f4f214f42ULL, 0xee6c9b3636d836adULL, 0x0451ffa6a6a2a659ULL, 0xbdb90cd2d26fd2deULL, 0x06f70ef5f5f3f5fbULL, 0x80f2967979f979efULL, 0xcede306f6fa16f5fULL, 0xef3f6d91917e91fcULL, 0x07a4f852525552aaULL, 0xfdc04760609d6027ULL, 0x766535bcbccabc89ULL, 0xcd2b379b9b569bacULL, 0x8c018a8e8e028e04ULL, 0x155bd2a3a3b6a371ULL, 0x3c186c0c0c300c60ULL, 0x8af6847b7bf17bffULL, 0xe16a803535d435b5ULL, 0x693af51d1d741de8ULL, 0x47ddb3e0e0a7e053ULL, 0xacb321d7d77bd7f6ULL, 0xed999cc2c22fc25eULL, 0x965c432e2eb82e6dULL, 0x7a96294b4b314b62ULL, 0x21e15dfefedffea3ULL, 0x16aed55757415782ULL, 0x412abd15155415a8ULL, 0xb6eee87777c1779fULL, 0xeb6e923737dc37a5ULL, 0x56d79ee5e5b3e57bULL, 0xd923139f9f469f8cULL, 0x17fd23f0f0e7f0d3ULL, 0x7f94204a4a354a6aULL, 0x95a944dada4fda9eULL, 0x25b0a258587d58faULL, 0xca8fcfc9c903c906ULL, 0x8d527c2929a42955ULL, 0x22145a0a0a280a50ULL, 0x4f7f50b1b1feb1e1ULL, 0x1a5dc9a0a0baa069ULL, 0xdad6146b6bb16b7fULL, 0xab17d985852e855cULL, 0x73673cbdbdcebd81ULL, 0x34ba8f5d5d695dd2ULL, 0x5020901010401080ULL, 0x03f507f4f4f7f4f3ULL, 0xc08bddcbcb0bcb16ULL, 0xc67cd33e3ef83eedULL, 0x110a2d0505140528ULL, 0xe6ce78676781671fULL, 0x53d597e4e4b7e473ULL, 
0xbb4e0227279c2725ULL, 0x5882734141194132ULL, 0x9d0ba78b8b168b2cULL, 0x0153f6a7a7a6a751ULL, 0x94fab27d7de97dcfULL, 0xfb374995956e95dcULL, 0x9fad56d8d847d88eULL, 0x30eb70fbfbcbfb8bULL, 0x71c1cdeeee9fee23ULL, 0x91f8bb7c7ced7cc7ULL, 0xe3cc716666856617ULL, 0x8ea77bdddd53dda6ULL, 0x4b2eaf17175c17b8ULL, 0x468e454747014702ULL, 0xdc211a9e9e429e84ULL, 0xc589d4caca0fca1eULL, 0x995a582d2db42d75ULL, 0x79632ebfbfc6bf91ULL, 0x1b0e3f07071c0738ULL, 0x2347acadad8ead01ULL, 0x2fb4b05a5a755aeaULL, 0xb51bef838336836cULL, 0xff66b63333cc3385ULL, 0xf2c65c636391633fULL, 0x0a04120202080210ULL, 0x384993aaaa92aa39ULL, 0xa8e2de7171d971afULL, 0xcf8dc6c8c807c80eULL, 0x7d32d119196419c8ULL, 0x70923b4949394972ULL, 0x9aaf5fd9d943d986ULL, 0x1df931f2f2eff2c3ULL, 0x48dba8e3e3abe34bULL, 0x2ab6b95b5b715be2ULL, 0x920dbc88881a8834ULL, 0xc8293e9a9a529aa4ULL, 0xbe4c0b262698262dULL, 0xfa64bf3232c8328dULL, 0x4a7d59b0b0fab0e9ULL, 0x6acff2e9e983e91bULL, 0x331e770f0f3c0f78ULL, 0xa6b733d5d573d5e6ULL, 0xba1df480803a8074ULL, 0x7c6127bebec2be99ULL, 0xde87ebcdcd13cd26ULL, 0xe468893434d034bdULL, 0x75903248483d487aULL, 0x24e354ffffdbffabULL, 0x8ff48d7a7af57af7ULL, 0xea3d6490907a90f4ULL, 0x3ebe9d5f5f615fc2ULL, 0xa0403d202080201dULL, 0xd5d00f6868bd6867ULL, 0x7234ca1a1a681ad0ULL, 0x2c41b7aeae82ae19ULL, 0x5e757db4b4eab4c9ULL, 0x19a8ce54544d549aULL, 0xe53b7f93937693ecULL, 0xaa442f222288220dULL, 0xe9c86364648d6407ULL, 0x12ff2af1f1e3f1dbULL, 0xa2e6cc7373d173bfULL, 0x5a24821212481290ULL, 0x5d807a40401d403aULL, 0x2810480808200840ULL, 0xe89b95c3c32bc356ULL, 0x7bc5dfecec97ec33ULL, 0x90ab4ddbdb4bdb96ULL, 0x1f5fc0a1a1bea161ULL, 0x8307918d8d0e8d1cULL, 0xc97ac83d3df43df5ULL, 0xf1335b97976697ccULL, 0x0000000000000000ULL, 0xd483f9cfcf1bcf36ULL, 0x87566e2b2bac2b45ULL, 0xb3ece17676c57697ULL, 0xb019e68282328264ULL, 0xa9b128d6d67fd6feULL, 0x7736c31b1b6c1bd8ULL, 0x5b7774b5b5eeb5c1ULL, 0x2943beafaf86af11ULL, 0xdfd41d6a6ab56a77ULL, 0x0da0ea50505d50baULL, 0x4c8a574545094512ULL, 0x18fb38f3f3ebf3cbULL, 0xf060ad3030c0309dULL, 
0x74c3c4efef9bef2bULL, 0xc37eda3f3ffc3fe5ULL, 0x1caac75555495592ULL, 0x1059dba2a2b2a279ULL, 0x65c9e9eaea8fea03ULL, 0xecca6a656589650fULL, 0x686903babad2bab9ULL, 0x935e4a2f2fbc2f65ULL, 0xe79d8ec0c027c04eULL, 0x81a160dede5fdebeULL, 0x6c38fc1c1c701ce0ULL, 0x2ee746fdfdd3fdbbULL, 0x649a1f4d4d294d52ULL, 0xe0397692927292e4ULL, 0xbceafa7575c9758fULL, 0x1e0c360606180630ULL, 0x9809ae8a8a128a24ULL, 0x40794bb2b2f2b2f9ULL, 0x59d185e6e6bfe663ULL, 0x361c7e0e0e380e70ULL, 0x633ee71f1f7c1ff8ULL, 0xf7c4556262956237ULL, 0xa3b53ad4d477d4eeULL, 0x324d81a8a89aa829ULL, 0xf4315296966296c4ULL, 0x3aef62f9f9c3f99bULL, 0xf697a3c5c533c566ULL, 0xb14a102525942535ULL, 0x20b2ab59597959f2ULL, 0xae15d084842a8454ULL, 0xa7e4c57272d572b7ULL, 0xdd72ec3939e439d5ULL, 0x6198164c4c2d4c5aULL, 0x3bbc945e5e655ecaULL, 0x85f09f7878fd78e7ULL, 0xd870e53838e038ddULL, 0x8605988c8c0a8c14ULL, 0xb2bf17d1d163d1c6ULL, 0x0b57e4a5a5aea541ULL, 0x4dd9a1e2e2afe243ULL, 0xf8c24e616199612fULL, 0x457b42b3b3f6b3f1ULL, 0xa542342121842115ULL, 0xd625089c9c4a9c94ULL, 0x663cee1e1e781ef0ULL, 0x5286614343114322ULL, 0xfc93b1c7c73bc776ULL, 0x2be54ffcfcd7fcb3ULL, 0x1408240404100420ULL, 0x08a2e351515951b2ULL, 0xc72f2599995e99bcULL, 0xc4da226d6da96d4fULL, 0x391a650d0d340d68ULL, 0x35e979fafacffa83ULL, 0x84a369dfdf5bdfb6ULL, 0x9bfca97e7ee57ed7ULL, 0xb44819242490243dULL, 0xd776fe3b3bec3bc5ULL, 0x3d4b9aabab96ab31ULL, 0xd181f0cece1fce3eULL, 0x5522991111441188ULL, 0x8903838f8f068f0cULL, 0x6b9c044e4e254e4aULL, 0x517366b7b7e6b7d1ULL, 0x60cbe0ebeb8beb0bULL, 0xcc78c13c3cf03cfdULL, 0xbf1ffd81813e817cULL, 0xfe354094946a94d4ULL, 0x0cf31cf7f7fbf7ebULL, 0x676f18b9b9deb9a1ULL, 0x5f268b13134c1398ULL, 0x9c58512c2cb02c7dULL, 0xb8bb05d3d36bd3d6ULL, 0x5cd38ce7e7bbe76bULL, 0xcbdc396e6ea56e57ULL, 0xf395aac4c437c46eULL, 0x0f061b03030c0318ULL, 0x13acdc565645568aULL, 0x49885e44440d441aULL, 0x9efea07f7fe17fdfULL, 0x374f88a9a99ea921ULL, 0x8254672a2aa82a4dULL, 0x6d6b0abbbbd6bbb1ULL, 0xe29f87c1c123c146ULL, 0x02a6f153535153a2ULL, 0x8ba572dcdc57dcaeULL, 
0x2716530b0b2c0b58ULL, 0xd327019d9d4e9d9cULL, 0xc1d82b6c6cad6c47ULL, 0xf562a43131c43195ULL, 0xb9e8f37474cd7487ULL, 0x09f115f6f6fff6e3ULL, 0x438c4c464605460aULL, 0x2645a5acac8aac09ULL, 0x970fb589891e893cULL, 0x4428b414145014a0ULL, 0x42dfbae1e1a3e15bULL, 0x4e2ca616165816b0ULL, 0xd274f73a3ae83acdULL, 0xd0d2066969b9696fULL, 0x2d12410909240948ULL, 0xade0d77070dd70a7ULL, 0x54716fb6b6e2b6d9ULL, 0xb7bd1ed0d067d0ceULL, 0x7ec7d6eded93ed3bULL, 0xdb85e2cccc17cc2eULL, 0x578468424215422aULL, 0xc22d2c98985a98b4ULL, 0x0e55eda4a4aaa449ULL, 0x8850752828a0285dULL, 0x31b8865c5c6d5cdaULL, 0x3fed6bf8f8c7f893ULL, 0xa411c28686228644ULL, }; static const u64 C4[256] = { 0xc07830d818186018ULL, 0x05af462623238c23ULL, 0x7ef991b8c6c63fc6ULL, 0x136fcdfbe8e887e8ULL, 0x4ca113cb87872687ULL, 0xa9626d11b8b8dab8ULL, 0x0805020901010401ULL, 0x426e9e0d4f4f214fULL, 0xadee6c9b3636d836ULL, 0x590451ffa6a6a2a6ULL, 0xdebdb90cd2d26fd2ULL, 0xfb06f70ef5f5f3f5ULL, 0xef80f2967979f979ULL, 0x5fcede306f6fa16fULL, 0xfcef3f6d91917e91ULL, 0xaa07a4f852525552ULL, 0x27fdc04760609d60ULL, 0x89766535bcbccabcULL, 0xaccd2b379b9b569bULL, 0x048c018a8e8e028eULL, 0x71155bd2a3a3b6a3ULL, 0x603c186c0c0c300cULL, 0xff8af6847b7bf17bULL, 0xb5e16a803535d435ULL, 0xe8693af51d1d741dULL, 0x5347ddb3e0e0a7e0ULL, 0xf6acb321d7d77bd7ULL, 0x5eed999cc2c22fc2ULL, 0x6d965c432e2eb82eULL, 0x627a96294b4b314bULL, 0xa321e15dfefedffeULL, 0x8216aed557574157ULL, 0xa8412abd15155415ULL, 0x9fb6eee87777c177ULL, 0xa5eb6e923737dc37ULL, 0x7b56d79ee5e5b3e5ULL, 0x8cd923139f9f469fULL, 0xd317fd23f0f0e7f0ULL, 0x6a7f94204a4a354aULL, 0x9e95a944dada4fdaULL, 0xfa25b0a258587d58ULL, 0x06ca8fcfc9c903c9ULL, 0x558d527c2929a429ULL, 0x5022145a0a0a280aULL, 0xe14f7f50b1b1feb1ULL, 0x691a5dc9a0a0baa0ULL, 0x7fdad6146b6bb16bULL, 0x5cab17d985852e85ULL, 0x8173673cbdbdcebdULL, 0xd234ba8f5d5d695dULL, 0x8050209010104010ULL, 0xf303f507f4f4f7f4ULL, 0x16c08bddcbcb0bcbULL, 0xedc67cd33e3ef83eULL, 0x28110a2d05051405ULL, 0x1fe6ce7867678167ULL, 0x7353d597e4e4b7e4ULL, 0x25bb4e0227279c27ULL, 
0x3258827341411941ULL, 0x2c9d0ba78b8b168bULL, 0x510153f6a7a7a6a7ULL, 0xcf94fab27d7de97dULL, 0xdcfb374995956e95ULL, 0x8e9fad56d8d847d8ULL, 0x8b30eb70fbfbcbfbULL, 0x2371c1cdeeee9feeULL, 0xc791f8bb7c7ced7cULL, 0x17e3cc7166668566ULL, 0xa68ea77bdddd53ddULL, 0xb84b2eaf17175c17ULL, 0x02468e4547470147ULL, 0x84dc211a9e9e429eULL, 0x1ec589d4caca0fcaULL, 0x75995a582d2db42dULL, 0x9179632ebfbfc6bfULL, 0x381b0e3f07071c07ULL, 0x012347acadad8eadULL, 0xea2fb4b05a5a755aULL, 0x6cb51bef83833683ULL, 0x85ff66b63333cc33ULL, 0x3ff2c65c63639163ULL, 0x100a041202020802ULL, 0x39384993aaaa92aaULL, 0xafa8e2de7171d971ULL, 0x0ecf8dc6c8c807c8ULL, 0xc87d32d119196419ULL, 0x7270923b49493949ULL, 0x869aaf5fd9d943d9ULL, 0xc31df931f2f2eff2ULL, 0x4b48dba8e3e3abe3ULL, 0xe22ab6b95b5b715bULL, 0x34920dbc88881a88ULL, 0xa4c8293e9a9a529aULL, 0x2dbe4c0b26269826ULL, 0x8dfa64bf3232c832ULL, 0xe94a7d59b0b0fab0ULL, 0x1b6acff2e9e983e9ULL, 0x78331e770f0f3c0fULL, 0xe6a6b733d5d573d5ULL, 0x74ba1df480803a80ULL, 0x997c6127bebec2beULL, 0x26de87ebcdcd13cdULL, 0xbde468893434d034ULL, 0x7a75903248483d48ULL, 0xab24e354ffffdbffULL, 0xf78ff48d7a7af57aULL, 0xf4ea3d6490907a90ULL, 0xc23ebe9d5f5f615fULL, 0x1da0403d20208020ULL, 0x67d5d00f6868bd68ULL, 0xd07234ca1a1a681aULL, 0x192c41b7aeae82aeULL, 0xc95e757db4b4eab4ULL, 0x9a19a8ce54544d54ULL, 0xece53b7f93937693ULL, 0x0daa442f22228822ULL, 0x07e9c86364648d64ULL, 0xdb12ff2af1f1e3f1ULL, 0xbfa2e6cc7373d173ULL, 0x905a248212124812ULL, 0x3a5d807a40401d40ULL, 0x4028104808082008ULL, 0x56e89b95c3c32bc3ULL, 0x337bc5dfecec97ecULL, 0x9690ab4ddbdb4bdbULL, 0x611f5fc0a1a1bea1ULL, 0x1c8307918d8d0e8dULL, 0xf5c97ac83d3df43dULL, 0xccf1335b97976697ULL, 0x0000000000000000ULL, 0x36d483f9cfcf1bcfULL, 0x4587566e2b2bac2bULL, 0x97b3ece17676c576ULL, 0x64b019e682823282ULL, 0xfea9b128d6d67fd6ULL, 0xd87736c31b1b6c1bULL, 0xc15b7774b5b5eeb5ULL, 0x112943beafaf86afULL, 0x77dfd41d6a6ab56aULL, 0xba0da0ea50505d50ULL, 0x124c8a5745450945ULL, 0xcb18fb38f3f3ebf3ULL, 0x9df060ad3030c030ULL, 0x2b74c3c4efef9befULL, 
0xe5c37eda3f3ffc3fULL, 0x921caac755554955ULL, 0x791059dba2a2b2a2ULL, 0x0365c9e9eaea8feaULL, 0x0fecca6a65658965ULL, 0xb9686903babad2baULL, 0x65935e4a2f2fbc2fULL, 0x4ee79d8ec0c027c0ULL, 0xbe81a160dede5fdeULL, 0xe06c38fc1c1c701cULL, 0xbb2ee746fdfdd3fdULL, 0x52649a1f4d4d294dULL, 0xe4e0397692927292ULL, 0x8fbceafa7575c975ULL, 0x301e0c3606061806ULL, 0x249809ae8a8a128aULL, 0xf940794bb2b2f2b2ULL, 0x6359d185e6e6bfe6ULL, 0x70361c7e0e0e380eULL, 0xf8633ee71f1f7c1fULL, 0x37f7c45562629562ULL, 0xeea3b53ad4d477d4ULL, 0x29324d81a8a89aa8ULL, 0xc4f4315296966296ULL, 0x9b3aef62f9f9c3f9ULL, 0x66f697a3c5c533c5ULL, 0x35b14a1025259425ULL, 0xf220b2ab59597959ULL, 0x54ae15d084842a84ULL, 0xb7a7e4c57272d572ULL, 0xd5dd72ec3939e439ULL, 0x5a6198164c4c2d4cULL, 0xca3bbc945e5e655eULL, 0xe785f09f7878fd78ULL, 0xddd870e53838e038ULL, 0x148605988c8c0a8cULL, 0xc6b2bf17d1d163d1ULL, 0x410b57e4a5a5aea5ULL, 0x434dd9a1e2e2afe2ULL, 0x2ff8c24e61619961ULL, 0xf1457b42b3b3f6b3ULL, 0x15a5423421218421ULL, 0x94d625089c9c4a9cULL, 0xf0663cee1e1e781eULL, 0x2252866143431143ULL, 0x76fc93b1c7c73bc7ULL, 0xb32be54ffcfcd7fcULL, 0x2014082404041004ULL, 0xb208a2e351515951ULL, 0xbcc72f2599995e99ULL, 0x4fc4da226d6da96dULL, 0x68391a650d0d340dULL, 0x8335e979fafacffaULL, 0xb684a369dfdf5bdfULL, 0xd79bfca97e7ee57eULL, 0x3db4481924249024ULL, 0xc5d776fe3b3bec3bULL, 0x313d4b9aabab96abULL, 0x3ed181f0cece1fceULL, 0x8855229911114411ULL, 0x0c8903838f8f068fULL, 0x4a6b9c044e4e254eULL, 0xd1517366b7b7e6b7ULL, 0x0b60cbe0ebeb8bebULL, 0xfdcc78c13c3cf03cULL, 0x7cbf1ffd81813e81ULL, 0xd4fe354094946a94ULL, 0xeb0cf31cf7f7fbf7ULL, 0xa1676f18b9b9deb9ULL, 0x985f268b13134c13ULL, 0x7d9c58512c2cb02cULL, 0xd6b8bb05d3d36bd3ULL, 0x6b5cd38ce7e7bbe7ULL, 0x57cbdc396e6ea56eULL, 0x6ef395aac4c437c4ULL, 0x180f061b03030c03ULL, 0x8a13acdc56564556ULL, 0x1a49885e44440d44ULL, 0xdf9efea07f7fe17fULL, 0x21374f88a9a99ea9ULL, 0x4d8254672a2aa82aULL, 0xb16d6b0abbbbd6bbULL, 0x46e29f87c1c123c1ULL, 0xa202a6f153535153ULL, 0xae8ba572dcdc57dcULL, 0x582716530b0b2c0bULL, 
0x9cd327019d9d4e9dULL, 0x47c1d82b6c6cad6cULL, 0x95f562a43131c431ULL, 0x87b9e8f37474cd74ULL, 0xe309f115f6f6fff6ULL, 0x0a438c4c46460546ULL, 0x092645a5acac8aacULL, 0x3c970fb589891e89ULL, 0xa04428b414145014ULL, 0x5b42dfbae1e1a3e1ULL, 0xb04e2ca616165816ULL, 0xcdd274f73a3ae83aULL, 0x6fd0d2066969b969ULL, 0x482d124109092409ULL, 0xa7ade0d77070dd70ULL, 0xd954716fb6b6e2b6ULL, 0xceb7bd1ed0d067d0ULL, 0x3b7ec7d6eded93edULL, 0x2edb85e2cccc17ccULL, 0x2a57846842421542ULL, 0xb4c22d2c98985a98ULL, 0x490e55eda4a4aaa4ULL, 0x5d8850752828a028ULL, 0xda31b8865c5c6d5cULL, 0x933fed6bf8f8c7f8ULL, 0x44a411c286862286ULL, }; static const u64 C5[256] = { 0x18c07830d8181860ULL, 0x2305af462623238cULL, 0xc67ef991b8c6c63fULL, 0xe8136fcdfbe8e887ULL, 0x874ca113cb878726ULL, 0xb8a9626d11b8b8daULL, 0x0108050209010104ULL, 0x4f426e9e0d4f4f21ULL, 0x36adee6c9b3636d8ULL, 0xa6590451ffa6a6a2ULL, 0xd2debdb90cd2d26fULL, 0xf5fb06f70ef5f5f3ULL, 0x79ef80f2967979f9ULL, 0x6f5fcede306f6fa1ULL, 0x91fcef3f6d91917eULL, 0x52aa07a4f8525255ULL, 0x6027fdc04760609dULL, 0xbc89766535bcbccaULL, 0x9baccd2b379b9b56ULL, 0x8e048c018a8e8e02ULL, 0xa371155bd2a3a3b6ULL, 0x0c603c186c0c0c30ULL, 0x7bff8af6847b7bf1ULL, 0x35b5e16a803535d4ULL, 0x1de8693af51d1d74ULL, 0xe05347ddb3e0e0a7ULL, 0xd7f6acb321d7d77bULL, 0xc25eed999cc2c22fULL, 0x2e6d965c432e2eb8ULL, 0x4b627a96294b4b31ULL, 0xfea321e15dfefedfULL, 0x578216aed5575741ULL, 0x15a8412abd151554ULL, 0x779fb6eee87777c1ULL, 0x37a5eb6e923737dcULL, 0xe57b56d79ee5e5b3ULL, 0x9f8cd923139f9f46ULL, 0xf0d317fd23f0f0e7ULL, 0x4a6a7f94204a4a35ULL, 0xda9e95a944dada4fULL, 0x58fa25b0a258587dULL, 0xc906ca8fcfc9c903ULL, 0x29558d527c2929a4ULL, 0x0a5022145a0a0a28ULL, 0xb1e14f7f50b1b1feULL, 0xa0691a5dc9a0a0baULL, 0x6b7fdad6146b6bb1ULL, 0x855cab17d985852eULL, 0xbd8173673cbdbdceULL, 0x5dd234ba8f5d5d69ULL, 0x1080502090101040ULL, 0xf4f303f507f4f4f7ULL, 0xcb16c08bddcbcb0bULL, 0x3eedc67cd33e3ef8ULL, 0x0528110a2d050514ULL, 0x671fe6ce78676781ULL, 0xe47353d597e4e4b7ULL, 0x2725bb4e0227279cULL, 0x4132588273414119ULL, 
0x8b2c9d0ba78b8b16ULL, 0xa7510153f6a7a7a6ULL, 0x7dcf94fab27d7de9ULL, 0x95dcfb374995956eULL, 0xd88e9fad56d8d847ULL, 0xfb8b30eb70fbfbcbULL, 0xee2371c1cdeeee9fULL, 0x7cc791f8bb7c7cedULL, 0x6617e3cc71666685ULL, 0xdda68ea77bdddd53ULL, 0x17b84b2eaf17175cULL, 0x4702468e45474701ULL, 0x9e84dc211a9e9e42ULL, 0xca1ec589d4caca0fULL, 0x2d75995a582d2db4ULL, 0xbf9179632ebfbfc6ULL, 0x07381b0e3f07071cULL, 0xad012347acadad8eULL, 0x5aea2fb4b05a5a75ULL, 0x836cb51bef838336ULL, 0x3385ff66b63333ccULL, 0x633ff2c65c636391ULL, 0x02100a0412020208ULL, 0xaa39384993aaaa92ULL, 0x71afa8e2de7171d9ULL, 0xc80ecf8dc6c8c807ULL, 0x19c87d32d1191964ULL, 0x497270923b494939ULL, 0xd9869aaf5fd9d943ULL, 0xf2c31df931f2f2efULL, 0xe34b48dba8e3e3abULL, 0x5be22ab6b95b5b71ULL, 0x8834920dbc88881aULL, 0x9aa4c8293e9a9a52ULL, 0x262dbe4c0b262698ULL, 0x328dfa64bf3232c8ULL, 0xb0e94a7d59b0b0faULL, 0xe91b6acff2e9e983ULL, 0x0f78331e770f0f3cULL, 0xd5e6a6b733d5d573ULL, 0x8074ba1df480803aULL, 0xbe997c6127bebec2ULL, 0xcd26de87ebcdcd13ULL, 0x34bde468893434d0ULL, 0x487a75903248483dULL, 0xffab24e354ffffdbULL, 0x7af78ff48d7a7af5ULL, 0x90f4ea3d6490907aULL, 0x5fc23ebe9d5f5f61ULL, 0x201da0403d202080ULL, 0x6867d5d00f6868bdULL, 0x1ad07234ca1a1a68ULL, 0xae192c41b7aeae82ULL, 0xb4c95e757db4b4eaULL, 0x549a19a8ce54544dULL, 0x93ece53b7f939376ULL, 0x220daa442f222288ULL, 0x6407e9c86364648dULL, 0xf1db12ff2af1f1e3ULL, 0x73bfa2e6cc7373d1ULL, 0x12905a2482121248ULL, 0x403a5d807a40401dULL, 0x0840281048080820ULL, 0xc356e89b95c3c32bULL, 0xec337bc5dfecec97ULL, 0xdb9690ab4ddbdb4bULL, 0xa1611f5fc0a1a1beULL, 0x8d1c8307918d8d0eULL, 0x3df5c97ac83d3df4ULL, 0x97ccf1335b979766ULL, 0x0000000000000000ULL, 0xcf36d483f9cfcf1bULL, 0x2b4587566e2b2bacULL, 0x7697b3ece17676c5ULL, 0x8264b019e6828232ULL, 0xd6fea9b128d6d67fULL, 0x1bd87736c31b1b6cULL, 0xb5c15b7774b5b5eeULL, 0xaf112943beafaf86ULL, 0x6a77dfd41d6a6ab5ULL, 0x50ba0da0ea50505dULL, 0x45124c8a57454509ULL, 0xf3cb18fb38f3f3ebULL, 0x309df060ad3030c0ULL, 0xef2b74c3c4efef9bULL, 0x3fe5c37eda3f3ffcULL, 
0x55921caac7555549ULL, 0xa2791059dba2a2b2ULL, 0xea0365c9e9eaea8fULL, 0x650fecca6a656589ULL, 0xbab9686903babad2ULL, 0x2f65935e4a2f2fbcULL, 0xc04ee79d8ec0c027ULL, 0xdebe81a160dede5fULL, 0x1ce06c38fc1c1c70ULL, 0xfdbb2ee746fdfdd3ULL, 0x4d52649a1f4d4d29ULL, 0x92e4e03976929272ULL, 0x758fbceafa7575c9ULL, 0x06301e0c36060618ULL, 0x8a249809ae8a8a12ULL, 0xb2f940794bb2b2f2ULL, 0xe66359d185e6e6bfULL, 0x0e70361c7e0e0e38ULL, 0x1ff8633ee71f1f7cULL, 0x6237f7c455626295ULL, 0xd4eea3b53ad4d477ULL, 0xa829324d81a8a89aULL, 0x96c4f43152969662ULL, 0xf99b3aef62f9f9c3ULL, 0xc566f697a3c5c533ULL, 0x2535b14a10252594ULL, 0x59f220b2ab595979ULL, 0x8454ae15d084842aULL, 0x72b7a7e4c57272d5ULL, 0x39d5dd72ec3939e4ULL, 0x4c5a6198164c4c2dULL, 0x5eca3bbc945e5e65ULL, 0x78e785f09f7878fdULL, 0x38ddd870e53838e0ULL, 0x8c148605988c8c0aULL, 0xd1c6b2bf17d1d163ULL, 0xa5410b57e4a5a5aeULL, 0xe2434dd9a1e2e2afULL, 0x612ff8c24e616199ULL, 0xb3f1457b42b3b3f6ULL, 0x2115a54234212184ULL, 0x9c94d625089c9c4aULL, 0x1ef0663cee1e1e78ULL, 0x4322528661434311ULL, 0xc776fc93b1c7c73bULL, 0xfcb32be54ffcfcd7ULL, 0x0420140824040410ULL, 0x51b208a2e3515159ULL, 0x99bcc72f2599995eULL, 0x6d4fc4da226d6da9ULL, 0x0d68391a650d0d34ULL, 0xfa8335e979fafacfULL, 0xdfb684a369dfdf5bULL, 0x7ed79bfca97e7ee5ULL, 0x243db44819242490ULL, 0x3bc5d776fe3b3becULL, 0xab313d4b9aabab96ULL, 0xce3ed181f0cece1fULL, 0x1188552299111144ULL, 0x8f0c8903838f8f06ULL, 0x4e4a6b9c044e4e25ULL, 0xb7d1517366b7b7e6ULL, 0xeb0b60cbe0ebeb8bULL, 0x3cfdcc78c13c3cf0ULL, 0x817cbf1ffd81813eULL, 0x94d4fe354094946aULL, 0xf7eb0cf31cf7f7fbULL, 0xb9a1676f18b9b9deULL, 0x13985f268b13134cULL, 0x2c7d9c58512c2cb0ULL, 0xd3d6b8bb05d3d36bULL, 0xe76b5cd38ce7e7bbULL, 0x6e57cbdc396e6ea5ULL, 0xc46ef395aac4c437ULL, 0x03180f061b03030cULL, 0x568a13acdc565645ULL, 0x441a49885e44440dULL, 0x7fdf9efea07f7fe1ULL, 0xa921374f88a9a99eULL, 0x2a4d8254672a2aa8ULL, 0xbbb16d6b0abbbbd6ULL, 0xc146e29f87c1c123ULL, 0x53a202a6f1535351ULL, 0xdcae8ba572dcdc57ULL, 0x0b582716530b0b2cULL, 0x9d9cd327019d9d4eULL, 
0x6c47c1d82b6c6cadULL, 0x3195f562a43131c4ULL, 0x7487b9e8f37474cdULL, 0xf6e309f115f6f6ffULL, 0x460a438c4c464605ULL, 0xac092645a5acac8aULL, 0x893c970fb589891eULL, 0x14a04428b4141450ULL, 0xe15b42dfbae1e1a3ULL, 0x16b04e2ca6161658ULL, 0x3acdd274f73a3ae8ULL, 0x696fd0d2066969b9ULL, 0x09482d1241090924ULL, 0x70a7ade0d77070ddULL, 0xb6d954716fb6b6e2ULL, 0xd0ceb7bd1ed0d067ULL, 0xed3b7ec7d6eded93ULL, 0xcc2edb85e2cccc17ULL, 0x422a578468424215ULL, 0x98b4c22d2c98985aULL, 0xa4490e55eda4a4aaULL, 0x285d8850752828a0ULL, 0x5cda31b8865c5c6dULL, 0xf8933fed6bf8f8c7ULL, 0x8644a411c2868622ULL, }; static const u64 C6[256] = { 0x6018c07830d81818ULL, 0x8c2305af46262323ULL, 0x3fc67ef991b8c6c6ULL, 0x87e8136fcdfbe8e8ULL, 0x26874ca113cb8787ULL, 0xdab8a9626d11b8b8ULL, 0x0401080502090101ULL, 0x214f426e9e0d4f4fULL, 0xd836adee6c9b3636ULL, 0xa2a6590451ffa6a6ULL, 0x6fd2debdb90cd2d2ULL, 0xf3f5fb06f70ef5f5ULL, 0xf979ef80f2967979ULL, 0xa16f5fcede306f6fULL, 0x7e91fcef3f6d9191ULL, 0x5552aa07a4f85252ULL, 0x9d6027fdc0476060ULL, 0xcabc89766535bcbcULL, 0x569baccd2b379b9bULL, 0x028e048c018a8e8eULL, 0xb6a371155bd2a3a3ULL, 0x300c603c186c0c0cULL, 0xf17bff8af6847b7bULL, 0xd435b5e16a803535ULL, 0x741de8693af51d1dULL, 0xa7e05347ddb3e0e0ULL, 0x7bd7f6acb321d7d7ULL, 0x2fc25eed999cc2c2ULL, 0xb82e6d965c432e2eULL, 0x314b627a96294b4bULL, 0xdffea321e15dfefeULL, 0x41578216aed55757ULL, 0x5415a8412abd1515ULL, 0xc1779fb6eee87777ULL, 0xdc37a5eb6e923737ULL, 0xb3e57b56d79ee5e5ULL, 0x469f8cd923139f9fULL, 0xe7f0d317fd23f0f0ULL, 0x354a6a7f94204a4aULL, 0x4fda9e95a944dadaULL, 0x7d58fa25b0a25858ULL, 0x03c906ca8fcfc9c9ULL, 0xa429558d527c2929ULL, 0x280a5022145a0a0aULL, 0xfeb1e14f7f50b1b1ULL, 0xbaa0691a5dc9a0a0ULL, 0xb16b7fdad6146b6bULL, 0x2e855cab17d98585ULL, 0xcebd8173673cbdbdULL, 0x695dd234ba8f5d5dULL, 0x4010805020901010ULL, 0xf7f4f303f507f4f4ULL, 0x0bcb16c08bddcbcbULL, 0xf83eedc67cd33e3eULL, 0x140528110a2d0505ULL, 0x81671fe6ce786767ULL, 0xb7e47353d597e4e4ULL, 0x9c2725bb4e022727ULL, 0x1941325882734141ULL, 0x168b2c9d0ba78b8bULL, 
0xa6a7510153f6a7a7ULL, 0xe97dcf94fab27d7dULL, 0x6e95dcfb37499595ULL, 0x47d88e9fad56d8d8ULL, 0xcbfb8b30eb70fbfbULL, 0x9fee2371c1cdeeeeULL, 0xed7cc791f8bb7c7cULL, 0x856617e3cc716666ULL, 0x53dda68ea77bddddULL, 0x5c17b84b2eaf1717ULL, 0x014702468e454747ULL, 0x429e84dc211a9e9eULL, 0x0fca1ec589d4cacaULL, 0xb42d75995a582d2dULL, 0xc6bf9179632ebfbfULL, 0x1c07381b0e3f0707ULL, 0x8ead012347acadadULL, 0x755aea2fb4b05a5aULL, 0x36836cb51bef8383ULL, 0xcc3385ff66b63333ULL, 0x91633ff2c65c6363ULL, 0x0802100a04120202ULL, 0x92aa39384993aaaaULL, 0xd971afa8e2de7171ULL, 0x07c80ecf8dc6c8c8ULL, 0x6419c87d32d11919ULL, 0x39497270923b4949ULL, 0x43d9869aaf5fd9d9ULL, 0xeff2c31df931f2f2ULL, 0xabe34b48dba8e3e3ULL, 0x715be22ab6b95b5bULL, 0x1a8834920dbc8888ULL, 0x529aa4c8293e9a9aULL, 0x98262dbe4c0b2626ULL, 0xc8328dfa64bf3232ULL, 0xfab0e94a7d59b0b0ULL, 0x83e91b6acff2e9e9ULL, 0x3c0f78331e770f0fULL, 0x73d5e6a6b733d5d5ULL, 0x3a8074ba1df48080ULL, 0xc2be997c6127bebeULL, 0x13cd26de87ebcdcdULL, 0xd034bde468893434ULL, 0x3d487a7590324848ULL, 0xdbffab24e354ffffULL, 0xf57af78ff48d7a7aULL, 0x7a90f4ea3d649090ULL, 0x615fc23ebe9d5f5fULL, 0x80201da0403d2020ULL, 0xbd6867d5d00f6868ULL, 0x681ad07234ca1a1aULL, 0x82ae192c41b7aeaeULL, 0xeab4c95e757db4b4ULL, 0x4d549a19a8ce5454ULL, 0x7693ece53b7f9393ULL, 0x88220daa442f2222ULL, 0x8d6407e9c8636464ULL, 0xe3f1db12ff2af1f1ULL, 0xd173bfa2e6cc7373ULL, 0x4812905a24821212ULL, 0x1d403a5d807a4040ULL, 0x2008402810480808ULL, 0x2bc356e89b95c3c3ULL, 0x97ec337bc5dfececULL, 0x4bdb9690ab4ddbdbULL, 0xbea1611f5fc0a1a1ULL, 0x0e8d1c8307918d8dULL, 0xf43df5c97ac83d3dULL, 0x6697ccf1335b9797ULL, 0x0000000000000000ULL, 0x1bcf36d483f9cfcfULL, 0xac2b4587566e2b2bULL, 0xc57697b3ece17676ULL, 0x328264b019e68282ULL, 0x7fd6fea9b128d6d6ULL, 0x6c1bd87736c31b1bULL, 0xeeb5c15b7774b5b5ULL, 0x86af112943beafafULL, 0xb56a77dfd41d6a6aULL, 0x5d50ba0da0ea5050ULL, 0x0945124c8a574545ULL, 0xebf3cb18fb38f3f3ULL, 0xc0309df060ad3030ULL, 0x9bef2b74c3c4efefULL, 0xfc3fe5c37eda3f3fULL, 0x4955921caac75555ULL, 
0xb2a2791059dba2a2ULL, 0x8fea0365c9e9eaeaULL, 0x89650fecca6a6565ULL, 0xd2bab9686903babaULL, 0xbc2f65935e4a2f2fULL, 0x27c04ee79d8ec0c0ULL, 0x5fdebe81a160dedeULL, 0x701ce06c38fc1c1cULL, 0xd3fdbb2ee746fdfdULL, 0x294d52649a1f4d4dULL, 0x7292e4e039769292ULL, 0xc9758fbceafa7575ULL, 0x1806301e0c360606ULL, 0x128a249809ae8a8aULL, 0xf2b2f940794bb2b2ULL, 0xbfe66359d185e6e6ULL, 0x380e70361c7e0e0eULL, 0x7c1ff8633ee71f1fULL, 0x956237f7c4556262ULL, 0x77d4eea3b53ad4d4ULL, 0x9aa829324d81a8a8ULL, 0x6296c4f431529696ULL, 0xc3f99b3aef62f9f9ULL, 0x33c566f697a3c5c5ULL, 0x942535b14a102525ULL, 0x7959f220b2ab5959ULL, 0x2a8454ae15d08484ULL, 0xd572b7a7e4c57272ULL, 0xe439d5dd72ec3939ULL, 0x2d4c5a6198164c4cULL, 0x655eca3bbc945e5eULL, 0xfd78e785f09f7878ULL, 0xe038ddd870e53838ULL, 0x0a8c148605988c8cULL, 0x63d1c6b2bf17d1d1ULL, 0xaea5410b57e4a5a5ULL, 0xafe2434dd9a1e2e2ULL, 0x99612ff8c24e6161ULL, 0xf6b3f1457b42b3b3ULL, 0x842115a542342121ULL, 0x4a9c94d625089c9cULL, 0x781ef0663cee1e1eULL, 0x1143225286614343ULL, 0x3bc776fc93b1c7c7ULL, 0xd7fcb32be54ffcfcULL, 0x1004201408240404ULL, 0x5951b208a2e35151ULL, 0x5e99bcc72f259999ULL, 0xa96d4fc4da226d6dULL, 0x340d68391a650d0dULL, 0xcffa8335e979fafaULL, 0x5bdfb684a369dfdfULL, 0xe57ed79bfca97e7eULL, 0x90243db448192424ULL, 0xec3bc5d776fe3b3bULL, 0x96ab313d4b9aababULL, 0x1fce3ed181f0ceceULL, 0x4411885522991111ULL, 0x068f0c8903838f8fULL, 0x254e4a6b9c044e4eULL, 0xe6b7d1517366b7b7ULL, 0x8beb0b60cbe0ebebULL, 0xf03cfdcc78c13c3cULL, 0x3e817cbf1ffd8181ULL, 0x6a94d4fe35409494ULL, 0xfbf7eb0cf31cf7f7ULL, 0xdeb9a1676f18b9b9ULL, 0x4c13985f268b1313ULL, 0xb02c7d9c58512c2cULL, 0x6bd3d6b8bb05d3d3ULL, 0xbbe76b5cd38ce7e7ULL, 0xa56e57cbdc396e6eULL, 0x37c46ef395aac4c4ULL, 0x0c03180f061b0303ULL, 0x45568a13acdc5656ULL, 0x0d441a49885e4444ULL, 0xe17fdf9efea07f7fULL, 0x9ea921374f88a9a9ULL, 0xa82a4d8254672a2aULL, 0xd6bbb16d6b0abbbbULL, 0x23c146e29f87c1c1ULL, 0x5153a202a6f15353ULL, 0x57dcae8ba572dcdcULL, 0x2c0b582716530b0bULL, 0x4e9d9cd327019d9dULL, 0xad6c47c1d82b6c6cULL, 
0xc43195f562a43131ULL, 0xcd7487b9e8f37474ULL, 0xfff6e309f115f6f6ULL, 0x05460a438c4c4646ULL, 0x8aac092645a5acacULL, 0x1e893c970fb58989ULL, 0x5014a04428b41414ULL, 0xa3e15b42dfbae1e1ULL, 0x5816b04e2ca61616ULL, 0xe83acdd274f73a3aULL, 0xb9696fd0d2066969ULL, 0x2409482d12410909ULL, 0xdd70a7ade0d77070ULL, 0xe2b6d954716fb6b6ULL, 0x67d0ceb7bd1ed0d0ULL, 0x93ed3b7ec7d6ededULL, 0x17cc2edb85e2ccccULL, 0x15422a5784684242ULL, 0x5a98b4c22d2c9898ULL, 0xaaa4490e55eda4a4ULL, 0xa0285d8850752828ULL, 0x6d5cda31b8865c5cULL, 0xc7f8933fed6bf8f8ULL, 0x228644a411c28686ULL, }; static const u64 C7[256] = { 0x186018c07830d818ULL, 0x238c2305af462623ULL, 0xc63fc67ef991b8c6ULL, 0xe887e8136fcdfbe8ULL, 0x8726874ca113cb87ULL, 0xb8dab8a9626d11b8ULL, 0x0104010805020901ULL, 0x4f214f426e9e0d4fULL, 0x36d836adee6c9b36ULL, 0xa6a2a6590451ffa6ULL, 0xd26fd2debdb90cd2ULL, 0xf5f3f5fb06f70ef5ULL, 0x79f979ef80f29679ULL, 0x6fa16f5fcede306fULL, 0x917e91fcef3f6d91ULL, 0x525552aa07a4f852ULL, 0x609d6027fdc04760ULL, 0xbccabc89766535bcULL, 0x9b569baccd2b379bULL, 0x8e028e048c018a8eULL, 0xa3b6a371155bd2a3ULL, 0x0c300c603c186c0cULL, 0x7bf17bff8af6847bULL, 0x35d435b5e16a8035ULL, 0x1d741de8693af51dULL, 0xe0a7e05347ddb3e0ULL, 0xd77bd7f6acb321d7ULL, 0xc22fc25eed999cc2ULL, 0x2eb82e6d965c432eULL, 0x4b314b627a96294bULL, 0xfedffea321e15dfeULL, 0x5741578216aed557ULL, 0x155415a8412abd15ULL, 0x77c1779fb6eee877ULL, 0x37dc37a5eb6e9237ULL, 0xe5b3e57b56d79ee5ULL, 0x9f469f8cd923139fULL, 0xf0e7f0d317fd23f0ULL, 0x4a354a6a7f94204aULL, 0xda4fda9e95a944daULL, 0x587d58fa25b0a258ULL, 0xc903c906ca8fcfc9ULL, 0x29a429558d527c29ULL, 0x0a280a5022145a0aULL, 0xb1feb1e14f7f50b1ULL, 0xa0baa0691a5dc9a0ULL, 0x6bb16b7fdad6146bULL, 0x852e855cab17d985ULL, 0xbdcebd8173673cbdULL, 0x5d695dd234ba8f5dULL, 0x1040108050209010ULL, 0xf4f7f4f303f507f4ULL, 0xcb0bcb16c08bddcbULL, 0x3ef83eedc67cd33eULL, 0x05140528110a2d05ULL, 0x6781671fe6ce7867ULL, 0xe4b7e47353d597e4ULL, 0x279c2725bb4e0227ULL, 0x4119413258827341ULL, 0x8b168b2c9d0ba78bULL, 0xa7a6a7510153f6a7ULL, 
0x7de97dcf94fab27dULL, 0x956e95dcfb374995ULL, 0xd847d88e9fad56d8ULL, 0xfbcbfb8b30eb70fbULL, 0xee9fee2371c1cdeeULL, 0x7ced7cc791f8bb7cULL, 0x66856617e3cc7166ULL, 0xdd53dda68ea77bddULL, 0x175c17b84b2eaf17ULL, 0x47014702468e4547ULL, 0x9e429e84dc211a9eULL, 0xca0fca1ec589d4caULL, 0x2db42d75995a582dULL, 0xbfc6bf9179632ebfULL, 0x071c07381b0e3f07ULL, 0xad8ead012347acadULL, 0x5a755aea2fb4b05aULL, 0x8336836cb51bef83ULL, 0x33cc3385ff66b633ULL, 0x6391633ff2c65c63ULL, 0x020802100a041202ULL, 0xaa92aa39384993aaULL, 0x71d971afa8e2de71ULL, 0xc807c80ecf8dc6c8ULL, 0x196419c87d32d119ULL, 0x4939497270923b49ULL, 0xd943d9869aaf5fd9ULL, 0xf2eff2c31df931f2ULL, 0xe3abe34b48dba8e3ULL, 0x5b715be22ab6b95bULL, 0x881a8834920dbc88ULL, 0x9a529aa4c8293e9aULL, 0x2698262dbe4c0b26ULL, 0x32c8328dfa64bf32ULL, 0xb0fab0e94a7d59b0ULL, 0xe983e91b6acff2e9ULL, 0x0f3c0f78331e770fULL, 0xd573d5e6a6b733d5ULL, 0x803a8074ba1df480ULL, 0xbec2be997c6127beULL, 0xcd13cd26de87ebcdULL, 0x34d034bde4688934ULL, 0x483d487a75903248ULL, 0xffdbffab24e354ffULL, 0x7af57af78ff48d7aULL, 0x907a90f4ea3d6490ULL, 0x5f615fc23ebe9d5fULL, 0x2080201da0403d20ULL, 0x68bd6867d5d00f68ULL, 0x1a681ad07234ca1aULL, 0xae82ae192c41b7aeULL, 0xb4eab4c95e757db4ULL, 0x544d549a19a8ce54ULL, 0x937693ece53b7f93ULL, 0x2288220daa442f22ULL, 0x648d6407e9c86364ULL, 0xf1e3f1db12ff2af1ULL, 0x73d173bfa2e6cc73ULL, 0x124812905a248212ULL, 0x401d403a5d807a40ULL, 0x0820084028104808ULL, 0xc32bc356e89b95c3ULL, 0xec97ec337bc5dfecULL, 0xdb4bdb9690ab4ddbULL, 0xa1bea1611f5fc0a1ULL, 0x8d0e8d1c8307918dULL, 0x3df43df5c97ac83dULL, 0x976697ccf1335b97ULL, 0x0000000000000000ULL, 0xcf1bcf36d483f9cfULL, 0x2bac2b4587566e2bULL, 0x76c57697b3ece176ULL, 0x82328264b019e682ULL, 0xd67fd6fea9b128d6ULL, 0x1b6c1bd87736c31bULL, 0xb5eeb5c15b7774b5ULL, 0xaf86af112943beafULL, 0x6ab56a77dfd41d6aULL, 0x505d50ba0da0ea50ULL, 0x450945124c8a5745ULL, 0xf3ebf3cb18fb38f3ULL, 0x30c0309df060ad30ULL, 0xef9bef2b74c3c4efULL, 0x3ffc3fe5c37eda3fULL, 0x554955921caac755ULL, 0xa2b2a2791059dba2ULL, 
0xea8fea0365c9e9eaULL, 0x6589650fecca6a65ULL, 0xbad2bab9686903baULL, 0x2fbc2f65935e4a2fULL, 0xc027c04ee79d8ec0ULL, 0xde5fdebe81a160deULL, 0x1c701ce06c38fc1cULL, 0xfdd3fdbb2ee746fdULL, 0x4d294d52649a1f4dULL, 0x927292e4e0397692ULL, 0x75c9758fbceafa75ULL, 0x061806301e0c3606ULL, 0x8a128a249809ae8aULL, 0xb2f2b2f940794bb2ULL, 0xe6bfe66359d185e6ULL, 0x0e380e70361c7e0eULL, 0x1f7c1ff8633ee71fULL, 0x62956237f7c45562ULL, 0xd477d4eea3b53ad4ULL, 0xa89aa829324d81a8ULL, 0x966296c4f4315296ULL, 0xf9c3f99b3aef62f9ULL, 0xc533c566f697a3c5ULL, 0x25942535b14a1025ULL, 0x597959f220b2ab59ULL, 0x842a8454ae15d084ULL, 0x72d572b7a7e4c572ULL, 0x39e439d5dd72ec39ULL, 0x4c2d4c5a6198164cULL, 0x5e655eca3bbc945eULL, 0x78fd78e785f09f78ULL, 0x38e038ddd870e538ULL, 0x8c0a8c148605988cULL, 0xd163d1c6b2bf17d1ULL, 0xa5aea5410b57e4a5ULL, 0xe2afe2434dd9a1e2ULL, 0x6199612ff8c24e61ULL, 0xb3f6b3f1457b42b3ULL, 0x21842115a5423421ULL, 0x9c4a9c94d625089cULL, 0x1e781ef0663cee1eULL, 0x4311432252866143ULL, 0xc73bc776fc93b1c7ULL, 0xfcd7fcb32be54ffcULL, 0x0410042014082404ULL, 0x515951b208a2e351ULL, 0x995e99bcc72f2599ULL, 0x6da96d4fc4da226dULL, 0x0d340d68391a650dULL, 0xfacffa8335e979faULL, 0xdf5bdfb684a369dfULL, 0x7ee57ed79bfca97eULL, 0x2490243db4481924ULL, 0x3bec3bc5d776fe3bULL, 0xab96ab313d4b9aabULL, 0xce1fce3ed181f0ceULL, 0x1144118855229911ULL, 0x8f068f0c8903838fULL, 0x4e254e4a6b9c044eULL, 0xb7e6b7d1517366b7ULL, 0xeb8beb0b60cbe0ebULL, 0x3cf03cfdcc78c13cULL, 0x813e817cbf1ffd81ULL, 0x946a94d4fe354094ULL, 0xf7fbf7eb0cf31cf7ULL, 0xb9deb9a1676f18b9ULL, 0x134c13985f268b13ULL, 0x2cb02c7d9c58512cULL, 0xd36bd3d6b8bb05d3ULL, 0xe7bbe76b5cd38ce7ULL, 0x6ea56e57cbdc396eULL, 0xc437c46ef395aac4ULL, 0x030c03180f061b03ULL, 0x5645568a13acdc56ULL, 0x440d441a49885e44ULL, 0x7fe17fdf9efea07fULL, 0xa99ea921374f88a9ULL, 0x2aa82a4d8254672aULL, 0xbbd6bbb16d6b0abbULL, 0xc123c146e29f87c1ULL, 0x535153a202a6f153ULL, 0xdc57dcae8ba572dcULL, 0x0b2c0b582716530bULL, 0x9d4e9d9cd327019dULL, 0x6cad6c47c1d82b6cULL, 0x31c43195f562a431ULL, 
0x74cd7487b9e8f374ULL, 0xf6fff6e309f115f6ULL, 0x4605460a438c4c46ULL, 0xac8aac092645a5acULL, 0x891e893c970fb589ULL, 0x145014a04428b414ULL, 0xe1a3e15b42dfbae1ULL, 0x165816b04e2ca616ULL, 0x3ae83acdd274f73aULL, 0x69b9696fd0d20669ULL, 0x092409482d124109ULL, 0x70dd70a7ade0d770ULL, 0xb6e2b6d954716fb6ULL, 0xd067d0ceb7bd1ed0ULL, 0xed93ed3b7ec7d6edULL, 0xcc17cc2edb85e2ccULL, 0x4215422a57846842ULL, 0x985a98b4c22d2c98ULL, 0xa4aaa4490e55eda4ULL, 0x28a0285d88507528ULL, 0x5c6d5cda31b8865cULL, 0xf8c7f8933fed6bf8ULL, 0x86228644a411c286ULL, }; static const u64 rc[WHIRLPOOL_ROUNDS] = { 0x1823c6e887b8014fULL, 0x36a6d2f5796f9152ULL, 0x60bc9b8ea30c7b35ULL, 0x1de0d7c22e4bfe57ULL, 0x157737e59ff04adaULL, 0x58c9290ab1a06b85ULL, 0xbd5d10f4cb3e0567ULL, 0xe427418ba77d95d8ULL, 0xfbee7c66dd17479eULL, 0xca2dbf07ad5a8333ULL, }; /* * The core Whirlpool transform. */ static __no_kmsan_checks void wp512_process_buffer(struct wp512_ctx *wctx) { int i, r; u64 K[8]; /* the round key */ u64 block[8]; /* mu(buffer) */ u64 state[8]; /* the cipher state */ u64 L[8]; const __be64 *buffer = (const __be64 *)wctx->buffer; for (i = 0; i < 8; i++) block[i] = be64_to_cpu(buffer[i]); state[0] = block[0] ^ (K[0] = wctx->hash[0]); state[1] = block[1] ^ (K[1] = wctx->hash[1]); state[2] = block[2] ^ (K[2] = wctx->hash[2]); state[3] = block[3] ^ (K[3] = wctx->hash[3]); state[4] = block[4] ^ (K[4] = wctx->hash[4]); state[5] = block[5] ^ (K[5] = wctx->hash[5]); state[6] = block[6] ^ (K[6] = wctx->hash[6]); state[7] = block[7] ^ (K[7] = wctx->hash[7]); for (r = 0; r < WHIRLPOOL_ROUNDS; r++) { L[0] = C0[(int)(K[0] >> 56) ] ^ C1[(int)(K[7] >> 48) & 0xff] ^ C2[(int)(K[6] >> 40) & 0xff] ^ C3[(int)(K[5] >> 32) & 0xff] ^ C4[(int)(K[4] >> 24) & 0xff] ^ C5[(int)(K[3] >> 16) & 0xff] ^ C6[(int)(K[2] >> 8) & 0xff] ^ C7[(int)(K[1] ) & 0xff] ^ rc[r]; L[1] = C0[(int)(K[1] >> 56) ] ^ C1[(int)(K[0] >> 48) & 0xff] ^ C2[(int)(K[7] >> 40) & 0xff] ^ C3[(int)(K[6] >> 32) & 0xff] ^ C4[(int)(K[5] >> 24) & 0xff] ^ C5[(int)(K[4] >> 16) & 0xff] ^ 
C6[(int)(K[3] >> 8) & 0xff] ^ C7[(int)(K[2] ) & 0xff]; L[2] = C0[(int)(K[2] >> 56) ] ^ C1[(int)(K[1] >> 48) & 0xff] ^ C2[(int)(K[0] >> 40) & 0xff] ^ C3[(int)(K[7] >> 32) & 0xff] ^ C4[(int)(K[6] >> 24) & 0xff] ^ C5[(int)(K[5] >> 16) & 0xff] ^ C6[(int)(K[4] >> 8) & 0xff] ^ C7[(int)(K[3] ) & 0xff]; L[3] = C0[(int)(K[3] >> 56) ] ^ C1[(int)(K[2] >> 48) & 0xff] ^ C2[(int)(K[1] >> 40) & 0xff] ^ C3[(int)(K[0] >> 32) & 0xff] ^ C4[(int)(K[7] >> 24) & 0xff] ^ C5[(int)(K[6] >> 16) & 0xff] ^ C6[(int)(K[5] >> 8) & 0xff] ^ C7[(int)(K[4] ) & 0xff]; L[4] = C0[(int)(K[4] >> 56) ] ^ C1[(int)(K[3] >> 48) & 0xff] ^ C2[(int)(K[2] >> 40) & 0xff] ^ C3[(int)(K[1] >> 32) & 0xff] ^ C4[(int)(K[0] >> 24) & 0xff] ^ C5[(int)(K[7] >> 16) & 0xff] ^ C6[(int)(K[6] >> 8) & 0xff] ^ C7[(int)(K[5] ) & 0xff]; L[5] = C0[(int)(K[5] >> 56) ] ^ C1[(int)(K[4] >> 48) & 0xff] ^ C2[(int)(K[3] >> 40) & 0xff] ^ C3[(int)(K[2] >> 32) & 0xff] ^ C4[(int)(K[1] >> 24) & 0xff] ^ C5[(int)(K[0] >> 16) & 0xff] ^ C6[(int)(K[7] >> 8) & 0xff] ^ C7[(int)(K[6] ) & 0xff]; L[6] = C0[(int)(K[6] >> 56) ] ^ C1[(int)(K[5] >> 48) & 0xff] ^ C2[(int)(K[4] >> 40) & 0xff] ^ C3[(int)(K[3] >> 32) & 0xff] ^ C4[(int)(K[2] >> 24) & 0xff] ^ C5[(int)(K[1] >> 16) & 0xff] ^ C6[(int)(K[0] >> 8) & 0xff] ^ C7[(int)(K[7] ) & 0xff]; L[7] = C0[(int)(K[7] >> 56) ] ^ C1[(int)(K[6] >> 48) & 0xff] ^ C2[(int)(K[5] >> 40) & 0xff] ^ C3[(int)(K[4] >> 32) & 0xff] ^ C4[(int)(K[3] >> 24) & 0xff] ^ C5[(int)(K[2] >> 16) & 0xff] ^ C6[(int)(K[1] >> 8) & 0xff] ^ C7[(int)(K[0] ) & 0xff]; K[0] = L[0]; K[1] = L[1]; K[2] = L[2]; K[3] = L[3]; K[4] = L[4]; K[5] = L[5]; K[6] = L[6]; K[7] = L[7]; L[0] = C0[(int)(state[0] >> 56) ] ^ C1[(int)(state[7] >> 48) & 0xff] ^ C2[(int)(state[6] >> 40) & 0xff] ^ C3[(int)(state[5] >> 32) & 0xff] ^ C4[(int)(state[4] >> 24) & 0xff] ^ C5[(int)(state[3] >> 16) & 0xff] ^ C6[(int)(state[2] >> 8) & 0xff] ^ C7[(int)(state[1] ) & 0xff] ^ K[0]; L[1] = C0[(int)(state[1] >> 56) ] ^ C1[(int)(state[0] >> 48) & 0xff] ^ C2[(int)(state[7] >> 40) & 0xff] ^ 
C3[(int)(state[6] >> 32) & 0xff] ^ C4[(int)(state[5] >> 24) & 0xff] ^ C5[(int)(state[4] >> 16) & 0xff] ^ C6[(int)(state[3] >> 8) & 0xff] ^ C7[(int)(state[2] ) & 0xff] ^ K[1]; L[2] = C0[(int)(state[2] >> 56) ] ^ C1[(int)(state[1] >> 48) & 0xff] ^ C2[(int)(state[0] >> 40) & 0xff] ^ C3[(int)(state[7] >> 32) & 0xff] ^ C4[(int)(state[6] >> 24) & 0xff] ^ C5[(int)(state[5] >> 16) & 0xff] ^ C6[(int)(state[4] >> 8) & 0xff] ^ C7[(int)(state[3] ) & 0xff] ^ K[2]; L[3] = C0[(int)(state[3] >> 56) ] ^ C1[(int)(state[2] >> 48) & 0xff] ^ C2[(int)(state[1] >> 40) & 0xff] ^ C3[(int)(state[0] >> 32) & 0xff] ^ C4[(int)(state[7] >> 24) & 0xff] ^ C5[(int)(state[6] >> 16) & 0xff] ^ C6[(int)(state[5] >> 8) & 0xff] ^ C7[(int)(state[4] ) & 0xff] ^ K[3]; L[4] = C0[(int)(state[4] >> 56) ] ^ C1[(int)(state[3] >> 48) & 0xff] ^ C2[(int)(state[2] >> 40) & 0xff] ^ C3[(int)(state[1] >> 32) & 0xff] ^ C4[(int)(state[0] >> 24) & 0xff] ^ C5[(int)(state[7] >> 16) & 0xff] ^ C6[(int)(state[6] >> 8) & 0xff] ^ C7[(int)(state[5] ) & 0xff] ^ K[4]; L[5] = C0[(int)(state[5] >> 56) ] ^ C1[(int)(state[4] >> 48) & 0xff] ^ C2[(int)(state[3] >> 40) & 0xff] ^ C3[(int)(state[2] >> 32) & 0xff] ^ C4[(int)(state[1] >> 24) & 0xff] ^ C5[(int)(state[0] >> 16) & 0xff] ^ C6[(int)(state[7] >> 8) & 0xff] ^ C7[(int)(state[6] ) & 0xff] ^ K[5]; L[6] = C0[(int)(state[6] >> 56) ] ^ C1[(int)(state[5] >> 48) & 0xff] ^ C2[(int)(state[4] >> 40) & 0xff] ^ C3[(int)(state[3] >> 32) & 0xff] ^ C4[(int)(state[2] >> 24) & 0xff] ^ C5[(int)(state[1] >> 16) & 0xff] ^ C6[(int)(state[0] >> 8) & 0xff] ^ C7[(int)(state[7] ) & 0xff] ^ K[6]; L[7] = C0[(int)(state[7] >> 56) ] ^ C1[(int)(state[6] >> 48) & 0xff] ^ C2[(int)(state[5] >> 40) & 0xff] ^ C3[(int)(state[4] >> 32) & 0xff] ^ C4[(int)(state[3] >> 24) & 0xff] ^ C5[(int)(state[2] >> 16) & 0xff] ^ C6[(int)(state[1] >> 8) & 0xff] ^ C7[(int)(state[0] ) & 0xff] ^ K[7]; state[0] = L[0]; state[1] = L[1]; state[2] = L[2]; state[3] = L[3]; state[4] = L[4]; state[5] = L[5]; state[6] = L[6]; state[7] = L[7]; } 
/* * apply the Miyaguchi-Preneel compression function: */ wctx->hash[0] ^= state[0] ^ block[0]; wctx->hash[1] ^= state[1] ^ block[1]; wctx->hash[2] ^= state[2] ^ block[2]; wctx->hash[3] ^= state[3] ^ block[3]; wctx->hash[4] ^= state[4] ^ block[4]; wctx->hash[5] ^= state[5] ^ block[5]; wctx->hash[6] ^= state[6] ^ block[6]; wctx->hash[7] ^= state[7] ^ block[7]; } static int wp512_init(struct shash_desc *desc) { struct wp512_ctx *wctx = shash_desc_ctx(desc); int i; memset(wctx->bitLength, 0, 32); wctx->bufferBits = wctx->bufferPos = 0; wctx->buffer[0] = 0; for (i = 0; i < 8; i++) { wctx->hash[i] = 0L; } return 0; } static int wp512_update(struct shash_desc *desc, const u8 *source, unsigned int len) { struct wp512_ctx *wctx = shash_desc_ctx(desc); int sourcePos = 0; unsigned int bits_len = len * 8; // convert to number of bits int sourceGap = (8 - ((int)bits_len & 7)) & 7; int bufferRem = wctx->bufferBits & 7; int i; u32 b, carry; u8 *buffer = wctx->buffer; u8 *bitLength = wctx->bitLength; int bufferBits = wctx->bufferBits; int bufferPos = wctx->bufferPos; u64 value = bits_len; for (i = 31, carry = 0; i >= 0 && (carry != 0 || value != 0ULL); i--) { carry += bitLength[i] + ((u32)value & 0xff); bitLength[i] = (u8)carry; carry >>= 8; value >>= 8; } while (bits_len > 8) { b = ((source[sourcePos] << sourceGap) & 0xff) | ((source[sourcePos + 1] & 0xff) >> (8 - sourceGap)); buffer[bufferPos++] |= (u8)(b >> bufferRem); bufferBits += 8 - bufferRem; if (bufferBits == WP512_BLOCK_SIZE * 8) { wp512_process_buffer(wctx); bufferBits = bufferPos = 0; } buffer[bufferPos] = b << (8 - bufferRem); bufferBits += bufferRem; bits_len -= 8; sourcePos++; } if (bits_len > 0) { b = (source[sourcePos] << sourceGap) & 0xff; buffer[bufferPos] |= b >> bufferRem; } else { b = 0; } if (bufferRem + bits_len < 8) { bufferBits += bits_len; } else { bufferPos++; bufferBits += 8 - bufferRem; bits_len -= 8 - bufferRem; if (bufferBits == WP512_BLOCK_SIZE * 8) { wp512_process_buffer(wctx); bufferBits = 
bufferPos = 0; } buffer[bufferPos] = b << (8 - bufferRem); bufferBits += (int)bits_len; } wctx->bufferBits = bufferBits; wctx->bufferPos = bufferPos; return 0; } static int wp512_final(struct shash_desc *desc, u8 *out) { struct wp512_ctx *wctx = shash_desc_ctx(desc); int i; u8 *buffer = wctx->buffer; u8 *bitLength = wctx->bitLength; int bufferBits = wctx->bufferBits; int bufferPos = wctx->bufferPos; __be64 *digest = (__be64 *)out; buffer[bufferPos] |= 0x80U >> (bufferBits & 7); bufferPos++; if (bufferPos > WP512_BLOCK_SIZE - WP512_LENGTHBYTES) { if (bufferPos < WP512_BLOCK_SIZE) memset(&buffer[bufferPos], 0, WP512_BLOCK_SIZE - bufferPos); wp512_process_buffer(wctx); bufferPos = 0; } if (bufferPos < WP512_BLOCK_SIZE - WP512_LENGTHBYTES) memset(&buffer[bufferPos], 0, (WP512_BLOCK_SIZE - WP512_LENGTHBYTES) - bufferPos); bufferPos = WP512_BLOCK_SIZE - WP512_LENGTHBYTES; memcpy(&buffer[WP512_BLOCK_SIZE - WP512_LENGTHBYTES], bitLength, WP512_LENGTHBYTES); wp512_process_buffer(wctx); for (i = 0; i < WP512_DIGEST_SIZE/8; i++) digest[i] = cpu_to_be64(wctx->hash[i]); wctx->bufferBits = bufferBits; wctx->bufferPos = bufferPos; return 0; } static int wp384_final(struct shash_desc *desc, u8 *out) { u8 D[64]; wp512_final(desc, D); memcpy(out, D, WP384_DIGEST_SIZE); memzero_explicit(D, WP512_DIGEST_SIZE); return 0; } static int wp256_final(struct shash_desc *desc, u8 *out) { u8 D[64]; wp512_final(desc, D); memcpy(out, D, WP256_DIGEST_SIZE); memzero_explicit(D, WP512_DIGEST_SIZE); return 0; } static struct shash_alg wp_algs[3] = { { .digestsize = WP512_DIGEST_SIZE, .init = wp512_init, .update = wp512_update, .final = wp512_final, .descsize = sizeof(struct wp512_ctx), .base = { .cra_name = "wp512", .cra_driver_name = "wp512-generic", .cra_blocksize = WP512_BLOCK_SIZE, .cra_module = THIS_MODULE, } }, { .digestsize = WP384_DIGEST_SIZE, .init = wp512_init, .update = wp512_update, .final = wp384_final, .descsize = sizeof(struct wp512_ctx), .base = { .cra_name = "wp384", 
.cra_driver_name = "wp384-generic", .cra_blocksize = WP512_BLOCK_SIZE, .cra_module = THIS_MODULE, } }, { .digestsize = WP256_DIGEST_SIZE, .init = wp512_init, .update = wp512_update, .final = wp256_final, .descsize = sizeof(struct wp512_ctx), .base = { .cra_name = "wp256", .cra_driver_name = "wp256-generic", .cra_blocksize = WP512_BLOCK_SIZE, .cra_module = THIS_MODULE, } } }; static int __init wp512_mod_init(void) { return crypto_register_shashes(wp_algs, ARRAY_SIZE(wp_algs)); } static void __exit wp512_mod_fini(void) { crypto_unregister_shashes(wp_algs, ARRAY_SIZE(wp_algs)); } MODULE_ALIAS_CRYPTO("wp512"); MODULE_ALIAS_CRYPTO("wp384"); MODULE_ALIAS_CRYPTO("wp256"); subsys_initcall(wp512_mod_init); module_exit(wp512_mod_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Whirlpool Message Digest Algorithm");
423 421 66 417 419 422 421 421 416 415 422 73 412 411 410 38 409 413 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 
510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 /* * Ext4 orphan inode handling */ #include <linux/fs.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> #include "ext4.h" #include "ext4_jbd2.h" static int ext4_orphan_file_add(handle_t *handle, struct inode *inode) { int i, j, start; struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info; int ret = 0; bool found = false; __le32 *bdata; int inodes_per_ob = ext4_inodes_per_orphan_block(inode->i_sb); int looped = 0; /* * Find block with free orphan entry. Use CPU number for a naive hash * for a search start in the orphan file */ start = raw_smp_processor_id()*13 % oi->of_blocks; i = start; do { if (atomic_dec_if_positive(&oi->of_binfo[i].ob_free_entries) >= 0) { found = true; break; } if (++i >= oi->of_blocks) i = 0; } while (i != start); if (!found) { /* * For now we don't grow or shrink orphan file. We just use * whatever was allocated at mke2fs time. The additional * credits we would have to reserve for each orphan inode * operation just don't seem worth it. */ return -ENOSPC; } ret = ext4_journal_get_write_access(handle, inode->i_sb, oi->of_binfo[i].ob_bh, EXT4_JTR_ORPHAN_FILE); if (ret) { atomic_inc(&oi->of_binfo[i].ob_free_entries); return ret; } bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data); /* Find empty slot in a block */ j = 0; do { if (looped) { /* * Did we walk through the block several times without * finding free entry? 
It is theoretically possible * if entries get constantly allocated and freed or * if the block is corrupted. Avoid indefinite looping * and bail. We'll use orphan list instead. */ if (looped > 3) { atomic_inc(&oi->of_binfo[i].ob_free_entries); return -ENOSPC; } cond_resched(); } while (bdata[j]) { if (++j >= inodes_per_ob) { j = 0; looped++; } } } while (cmpxchg(&bdata[j], (__le32)0, cpu_to_le32(inode->i_ino)) != (__le32)0); EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j; ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE); return ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[i].ob_bh); } /* * ext4_orphan_add() links an unlinked or truncated inode into a list of * such inodes, starting at the superblock, in case we crash before the * file is closed/deleted, or in case the inode truncate spans multiple * transactions and the last transaction is not recovered after a crash. * * At filesystem recovery time, we walk this list deleting unlinked * inodes and truncating linked inodes in ext4_orphan_cleanup(). * * Orphan list manipulation functions must be called under i_rwsem unless * we are just creating the inode or deleting it. */ int ext4_orphan_add(handle_t *handle, struct inode *inode) { struct super_block *sb = inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_iloc iloc; int err = 0, rc; bool dirty = false; if (!sbi->s_journal || is_bad_inode(inode)) return 0; WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && !inode_is_locked(inode)); /* * Inode orphaned in orphan file or in orphan list? */ if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE) || !list_empty(&EXT4_I(inode)->i_orphan)) return 0; /* * Orphan handling is only valid for files with data blocks * being truncated, or files being unlinked. 
Note that we either * hold i_rwsem, or the inode can not be referenced from outside, * so i_nlink should not be bumped due to race */ ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); if (sbi->s_orphan_info.of_blocks) { err = ext4_orphan_file_add(handle, inode); /* * Fallback to normal orphan list of orphan file is * out of space */ if (err != -ENOSPC) return err; } BUFFER_TRACE(sbi->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, EXT4_JTR_NONE); if (err) goto out; err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) goto out; mutex_lock(&sbi->s_orphan_lock); /* * Due to previous errors inode may be already a part of on-disk * orphan list. If so skip on-disk list modification. */ if (!NEXT_ORPHAN(inode) || NEXT_ORPHAN(inode) > (le32_to_cpu(sbi->s_es->s_inodes_count))) { /* Insert this inode at the head of the on-disk orphan list */ NEXT_ORPHAN(inode) = le32_to_cpu(sbi->s_es->s_last_orphan); lock_buffer(sbi->s_sbh); sbi->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); ext4_superblock_csum_set(sb); unlock_buffer(sbi->s_sbh); dirty = true; } list_add(&EXT4_I(inode)->i_orphan, &sbi->s_orphan); mutex_unlock(&sbi->s_orphan_lock); if (dirty) { err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); rc = ext4_mark_iloc_dirty(handle, inode, &iloc); if (!err) err = rc; if (err) { /* * We have to remove inode from in-memory list if * addition to on disk orphan list failed. Stray orphan * list entries can cause panics at unmount time. 
*/ mutex_lock(&sbi->s_orphan_lock); list_del_init(&EXT4_I(inode)->i_orphan); mutex_unlock(&sbi->s_orphan_lock); } } else brelse(iloc.bh); ext4_debug("superblock will point to %lu\n", inode->i_ino); ext4_debug("orphan inode %lu will point to %d\n", inode->i_ino, NEXT_ORPHAN(inode)); out: ext4_std_error(sb, err); return err; } static int ext4_orphan_file_del(handle_t *handle, struct inode *inode) { struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info; __le32 *bdata; int blk, off; int inodes_per_ob = ext4_inodes_per_orphan_block(inode->i_sb); int ret = 0; if (!handle) goto out; blk = EXT4_I(inode)->i_orphan_idx / inodes_per_ob; off = EXT4_I(inode)->i_orphan_idx % inodes_per_ob; if (WARN_ON_ONCE(blk >= oi->of_blocks)) goto out; ret = ext4_journal_get_write_access(handle, inode->i_sb, oi->of_binfo[blk].ob_bh, EXT4_JTR_ORPHAN_FILE); if (ret) goto out; bdata = (__le32 *)(oi->of_binfo[blk].ob_bh->b_data); bdata[off] = 0; atomic_inc(&oi->of_binfo[blk].ob_free_entries); ret = ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[blk].ob_bh); out: ext4_clear_inode_state(inode, EXT4_STATE_ORPHAN_FILE); INIT_LIST_HEAD(&EXT4_I(inode)->i_orphan); return ret; } /* * ext4_orphan_del() removes an unlinked or truncated inode from the list * of such inodes stored on disk, because it is finally being cleaned up. */ int ext4_orphan_del(handle_t *handle, struct inode *inode) { struct list_head *prev; struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 ino_next; struct ext4_iloc iloc; int err = 0; if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS)) return 0; WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && !inode_is_locked(inode)); if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE)) return ext4_orphan_file_del(handle, inode); /* Do this quick check before taking global s_orphan_lock. 
*/ if (list_empty(&ei->i_orphan)) return 0; if (handle) { /* Grab inode buffer early before taking global s_orphan_lock */ err = ext4_reserve_inode_write(handle, inode, &iloc); } mutex_lock(&sbi->s_orphan_lock); ext4_debug("remove inode %lu from orphan list\n", inode->i_ino); prev = ei->i_orphan.prev; list_del_init(&ei->i_orphan); /* If we're on an error path, we may not have a valid * transaction handle with which to update the orphan list on * disk, but we still need to remove the inode from the linked * list in memory. */ if (!handle || err) { mutex_unlock(&sbi->s_orphan_lock); goto out_err; } ino_next = NEXT_ORPHAN(inode); if (prev == &sbi->s_orphan) { ext4_debug("superblock will point to %u\n", ino_next); BUFFER_TRACE(sbi->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, inode->i_sb, sbi->s_sbh, EXT4_JTR_NONE); if (err) { mutex_unlock(&sbi->s_orphan_lock); goto out_brelse; } lock_buffer(sbi->s_sbh); sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); ext4_superblock_csum_set(inode->i_sb); unlock_buffer(sbi->s_sbh); mutex_unlock(&sbi->s_orphan_lock); err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); } else { struct ext4_iloc iloc2; struct inode *i_prev = &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode; ext4_debug("orphan inode %lu will point to %u\n", i_prev->i_ino, ino_next); err = ext4_reserve_inode_write(handle, i_prev, &iloc2); if (err) { mutex_unlock(&sbi->s_orphan_lock); goto out_brelse; } NEXT_ORPHAN(i_prev) = ino_next; err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2); mutex_unlock(&sbi->s_orphan_lock); } if (err) goto out_brelse; NEXT_ORPHAN(inode) = 0; err = ext4_mark_iloc_dirty(handle, inode, &iloc); out_err: ext4_std_error(inode->i_sb, err); return err; out_brelse: brelse(iloc.bh); goto out_err; } #ifdef CONFIG_QUOTA static int ext4_quota_on_mount(struct super_block *sb, int type) { return dquot_quota_on_mount(sb, rcu_dereference_protected(EXT4_SB(sb)->s_qf_names[type], lockdep_is_held(&sb->s_umount)), 
EXT4_SB(sb)->s_jquota_fmt, type); } #endif static void ext4_process_orphan(struct inode *inode, int *nr_truncates, int *nr_orphans) { struct super_block *sb = inode->i_sb; int ret; dquot_initialize(inode); if (inode->i_nlink) { if (test_opt(sb, DEBUG)) ext4_msg(sb, KERN_DEBUG, "%s: truncating inode %lu to %lld bytes", __func__, inode->i_ino, inode->i_size); ext4_debug("truncating inode %lu to %lld bytes\n", inode->i_ino, inode->i_size); inode_lock(inode); truncate_inode_pages(inode->i_mapping, inode->i_size); ret = ext4_truncate(inode); if (ret) { /* * We need to clean up the in-core orphan list * manually if ext4_truncate() failed to get a * transaction handle. */ ext4_orphan_del(NULL, inode); ext4_std_error(inode->i_sb, ret); } inode_unlock(inode); (*nr_truncates)++; } else { if (test_opt(sb, DEBUG)) ext4_msg(sb, KERN_DEBUG, "%s: deleting unreferenced inode %lu", __func__, inode->i_ino); ext4_debug("deleting unreferenced inode %lu\n", inode->i_ino); (*nr_orphans)++; } iput(inode); /* The delete magic happens here! */ } /* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at * the superblock) which were deleted from all directories, but held open by * a process at the time of a crash. We walk the list and try to delete these * inodes at recovery time (only with a read-write filesystem). * * In order to keep the orphan inode chain consistent during traversal (in * case of crash during recovery), we link each inode into the superblock * orphan list_head and handle it the same way as an inode deletion during * normal operation (which journals the operations for us). * * We only do an iget() and an iput() on each inode, which is very safe if we * accidentally point at an in-use or already deleted inode. The worst that * can happen in this case is that we get a "bit already cleared" message from * ext4_free_inode(). 
The only reason we would point at a wrong inode is if * e2fsck was run on this filesystem, and it must have already done the orphan * inode cleanup for us, so we can safely abort without any further action. */ void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es) { unsigned int s_flags = sb->s_flags; int nr_orphans = 0, nr_truncates = 0; struct inode *inode; int i, j; #ifdef CONFIG_QUOTA int quota_update = 0; #endif __le32 *bdata; struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; int inodes_per_ob = ext4_inodes_per_orphan_block(sb); if (!es->s_last_orphan && !oi->of_blocks) { ext4_debug("no orphan inodes to clean up\n"); return; } if (bdev_read_only(sb->s_bdev)) { ext4_msg(sb, KERN_ERR, "write access " "unavailable, skipping orphan cleanup"); return; } /* Check if feature set would not allow a r/w mount */ if (!ext4_feature_set_ok(sb, 0)) { ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to " "unknown ROCOMPAT features"); return; } if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { /* don't clear list on RO mount w/ errors */ if (es->s_last_orphan && !(s_flags & SB_RDONLY)) { ext4_msg(sb, KERN_INFO, "Errors on filesystem, " "clearing orphan list."); es->s_last_orphan = 0; } ext4_debug("Skipping orphan recovery on fs with errors.\n"); return; } if (s_flags & SB_RDONLY) { ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs"); sb->s_flags &= ~SB_RDONLY; } #ifdef CONFIG_QUOTA /* * Turn on quotas which were not enabled for read-only mounts if * filesystem has quota feature, so that they are updated correctly. 
*/ if (ext4_has_feature_quota(sb) && (s_flags & SB_RDONLY)) { int ret = ext4_enable_quotas(sb); if (!ret) quota_update = 1; else ext4_msg(sb, KERN_ERR, "Cannot turn on quotas: error %d", ret); } /* Turn on journaled quotas used for old sytle */ for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (EXT4_SB(sb)->s_qf_names[i]) { int ret = ext4_quota_on_mount(sb, i); if (!ret) quota_update = 1; else ext4_msg(sb, KERN_ERR, "Cannot turn on journaled " "quota: type %d: error %d", i, ret); } } #endif while (es->s_last_orphan) { /* * We may have encountered an error during cleanup; if * so, skip the rest. */ if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { ext4_debug("Skipping orphan recovery on fs with errors.\n"); es->s_last_orphan = 0; break; } inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan)); if (IS_ERR(inode)) { es->s_last_orphan = 0; break; } list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); ext4_process_orphan(inode, &nr_truncates, &nr_orphans); } for (i = 0; i < oi->of_blocks; i++) { bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data); for (j = 0; j < inodes_per_ob; j++) { if (!bdata[j]) continue; inode = ext4_orphan_get(sb, le32_to_cpu(bdata[j])); if (IS_ERR(inode)) continue; ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE); EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j; ext4_process_orphan(inode, &nr_truncates, &nr_orphans); } } #define PLURAL(x) (x), ((x) == 1) ? 
"" : "s" if (nr_orphans) ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted", PLURAL(nr_orphans)); if (nr_truncates) ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up", PLURAL(nr_truncates)); #ifdef CONFIG_QUOTA /* Turn off quotas if they were enabled for orphan cleanup */ if (quota_update) { for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (sb_dqopt(sb)->files[i]) dquot_quota_off(sb, i); } } #endif sb->s_flags = s_flags; /* Restore SB_RDONLY status */ } void ext4_release_orphan_info(struct super_block *sb) { int i; struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; if (!oi->of_blocks) return; for (i = 0; i < oi->of_blocks; i++) brelse(oi->of_binfo[i].ob_bh); kfree(oi->of_binfo); } static struct ext4_orphan_block_tail *ext4_orphan_block_tail( struct super_block *sb, struct buffer_head *bh) { return (struct ext4_orphan_block_tail *)(bh->b_data + sb->s_blocksize - sizeof(struct ext4_orphan_block_tail)); } static int ext4_orphan_file_block_csum_verify(struct super_block *sb, struct buffer_head *bh) { __u32 calculated; int inodes_per_ob = ext4_inodes_per_orphan_block(sb); struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; struct ext4_orphan_block_tail *ot; __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr); if (!ext4_has_metadata_csum(sb)) return 1; ot = ext4_orphan_block_tail(sb, bh); calculated = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed, (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); calculated = ext4_chksum(EXT4_SB(sb), calculated, (__u8 *)bh->b_data, inodes_per_ob * sizeof(__u32)); return le32_to_cpu(ot->ob_checksum) == calculated; } /* This gets called only when checksumming is enabled */ void ext4_orphan_file_block_trigger(struct jbd2_buffer_trigger_type *triggers, struct buffer_head *bh, void *data, size_t size) { struct super_block *sb = EXT4_TRIGGER(triggers)->sb; __u32 csum; int inodes_per_ob = ext4_inodes_per_orphan_block(sb); struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; struct ext4_orphan_block_tail *ot; __le64 dsk_block_nr = 
cpu_to_le64(bh->b_blocknr); csum = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed, (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); csum = ext4_chksum(EXT4_SB(sb), csum, (__u8 *)data, inodes_per_ob * sizeof(__u32)); ot = ext4_orphan_block_tail(sb, bh); ot->ob_checksum = cpu_to_le32(csum); } int ext4_init_orphan_info(struct super_block *sb) { struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; struct inode *inode; int i, j; int ret; int free; __le32 *bdata; int inodes_per_ob = ext4_inodes_per_orphan_block(sb); struct ext4_orphan_block_tail *ot; ino_t orphan_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_orphan_file_inum); if (!ext4_has_feature_orphan_file(sb)) return 0; inode = ext4_iget(sb, orphan_ino, EXT4_IGET_SPECIAL); if (IS_ERR(inode)) { ext4_msg(sb, KERN_ERR, "get orphan inode failed"); return PTR_ERR(inode); } oi->of_blocks = inode->i_size >> sb->s_blocksize_bits; oi->of_csum_seed = EXT4_I(inode)->i_csum_seed; oi->of_binfo = kmalloc(oi->of_blocks*sizeof(struct ext4_orphan_block), GFP_KERNEL); if (!oi->of_binfo) { ret = -ENOMEM; goto out_put; } for (i = 0; i < oi->of_blocks; i++) { oi->of_binfo[i].ob_bh = ext4_bread(NULL, inode, i, 0); if (IS_ERR(oi->of_binfo[i].ob_bh)) { ret = PTR_ERR(oi->of_binfo[i].ob_bh); goto out_free; } if (!oi->of_binfo[i].ob_bh) { ret = -EIO; goto out_free; } ot = ext4_orphan_block_tail(sb, oi->of_binfo[i].ob_bh); if (le32_to_cpu(ot->ob_magic) != EXT4_ORPHAN_BLOCK_MAGIC) { ext4_error(sb, "orphan file block %d: bad magic", i); ret = -EIO; goto out_free; } if (!ext4_orphan_file_block_csum_verify(sb, oi->of_binfo[i].ob_bh)) { ext4_error(sb, "orphan file block %d: bad checksum", i); ret = -EIO; goto out_free; } bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data); free = 0; for (j = 0; j < inodes_per_ob; j++) if (bdata[j] == 0) free++; atomic_set(&oi->of_binfo[i].ob_free_entries, free); } iput(inode); return 0; out_free: for (i--; i >= 0; i--) brelse(oi->of_binfo[i].ob_bh); kfree(oi->of_binfo); out_put: iput(inode); return ret; } int 
ext4_orphan_file_empty(struct super_block *sb) { struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; int i; int inodes_per_ob = ext4_inodes_per_orphan_block(sb); if (!ext4_has_feature_orphan_file(sb)) return 1; for (i = 0; i < oi->of_blocks; i++) if (atomic_read(&oi->of_binfo[i].ob_free_entries) != inodes_per_ob) return 0; return 1; }
28 3 165 28 28 30 183 183 27 29 29 13 13 13 234 235 189 175 234 238 215 218 217 214 173 163 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 // SPDX-License-Identifier: GPL-2.0-only #include <linux/stat.h> #include <linux/sysctl.h> #include <linux/slab.h> #include <linux/cred.h> #include <linux/hash.h> #include <linux/kmemleak.h> #include <linux/user_namespace.h> struct ucounts init_ucounts = { .ns = &init_user_ns, .uid = GLOBAL_ROOT_UID, .count = ATOMIC_INIT(1), }; #define UCOUNTS_HASHTABLE_BITS 10 static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)]; static DEFINE_SPINLOCK(ucounts_lock); #define ucounts_hashfn(ns, 
uid) \ hash_long((unsigned long)__kuid_val(uid) + (unsigned long)(ns), \ UCOUNTS_HASHTABLE_BITS) #define ucounts_hashentry(ns, uid) \ (ucounts_hashtable + ucounts_hashfn(ns, uid)) #ifdef CONFIG_SYSCTL static struct ctl_table_set * set_lookup(struct ctl_table_root *root) { return &current_user_ns()->set; } static int set_is_seen(struct ctl_table_set *set) { return &current_user_ns()->set == set; } static int set_permissions(struct ctl_table_header *head, struct ctl_table *table) { struct user_namespace *user_ns = container_of(head->set, struct user_namespace, set); int mode; /* Allow users with CAP_SYS_RESOURCE unrestrained access */ if (ns_capable(user_ns, CAP_SYS_RESOURCE)) mode = (table->mode & S_IRWXU) >> 6; else /* Allow all others at most read-only access */ mode = table->mode & S_IROTH; return (mode << 6) | (mode << 3) | mode; } static struct ctl_table_root set_root = { .lookup = set_lookup, .permissions = set_permissions, }; static long ue_zero = 0; static long ue_int_max = INT_MAX; #define UCOUNT_ENTRY(name) \ { \ .procname = name, \ .maxlen = sizeof(long), \ .mode = 0644, \ .proc_handler = proc_doulongvec_minmax, \ .extra1 = &ue_zero, \ .extra2 = &ue_int_max, \ } static struct ctl_table user_table[] = { UCOUNT_ENTRY("max_user_namespaces"), UCOUNT_ENTRY("max_pid_namespaces"), UCOUNT_ENTRY("max_uts_namespaces"), UCOUNT_ENTRY("max_ipc_namespaces"), UCOUNT_ENTRY("max_net_namespaces"), UCOUNT_ENTRY("max_mnt_namespaces"), UCOUNT_ENTRY("max_cgroup_namespaces"), UCOUNT_ENTRY("max_time_namespaces"), #ifdef CONFIG_INOTIFY_USER UCOUNT_ENTRY("max_inotify_instances"), UCOUNT_ENTRY("max_inotify_watches"), #endif #ifdef CONFIG_FANOTIFY UCOUNT_ENTRY("max_fanotify_groups"), UCOUNT_ENTRY("max_fanotify_marks"), #endif { } }; #endif /* CONFIG_SYSCTL */ bool setup_userns_sysctls(struct user_namespace *ns) { #ifdef CONFIG_SYSCTL struct ctl_table *tbl; BUILD_BUG_ON(ARRAY_SIZE(user_table) != UCOUNT_COUNTS + 1); setup_sysctl_set(&ns->set, &set_root, set_is_seen); tbl = 
kmemdup(user_table, sizeof(user_table), GFP_KERNEL); if (tbl) { int i; for (i = 0; i < UCOUNT_COUNTS; i++) { tbl[i].data = &ns->ucount_max[i]; } ns->sysctls = __register_sysctl_table(&ns->set, "user", tbl, ARRAY_SIZE(user_table)); } if (!ns->sysctls) { kfree(tbl); retire_sysctl_set(&ns->set); return false; } #endif return true; } void retire_userns_sysctls(struct user_namespace *ns) { #ifdef CONFIG_SYSCTL struct ctl_table *tbl; tbl = ns->sysctls->ctl_table_arg; unregister_sysctl_table(ns->sysctls); retire_sysctl_set(&ns->set); kfree(tbl); #endif } static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struct hlist_head *hashent) { struct ucounts *ucounts; hlist_for_each_entry(ucounts, hashent, node) { if (uid_eq(ucounts->uid, uid) && (ucounts->ns == ns)) return ucounts; } return NULL; } static void hlist_add_ucounts(struct ucounts *ucounts) { struct hlist_head *hashent = ucounts_hashentry(ucounts->ns, ucounts->uid); spin_lock_irq(&ucounts_lock); hlist_add_head(&ucounts->node, hashent); spin_unlock_irq(&ucounts_lock); } static inline bool get_ucounts_or_wrap(struct ucounts *ucounts) { /* Returns true on a successful get, false if the count wraps. 
*/ return !atomic_add_negative(1, &ucounts->count); } struct ucounts *get_ucounts(struct ucounts *ucounts) { if (!get_ucounts_or_wrap(ucounts)) { put_ucounts(ucounts); ucounts = NULL; } return ucounts; } struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) { struct hlist_head *hashent = ucounts_hashentry(ns, uid); struct ucounts *ucounts, *new; bool wrapped; spin_lock_irq(&ucounts_lock); ucounts = find_ucounts(ns, uid, hashent); if (!ucounts) { spin_unlock_irq(&ucounts_lock); new = kzalloc(sizeof(*new), GFP_KERNEL); if (!new) return NULL; new->ns = ns; new->uid = uid; atomic_set(&new->count, 1); spin_lock_irq(&ucounts_lock); ucounts = find_ucounts(ns, uid, hashent); if (ucounts) { kfree(new); } else { hlist_add_head(&new->node, hashent); get_user_ns(new->ns); spin_unlock_irq(&ucounts_lock); return new; } } wrapped = !get_ucounts_or_wrap(ucounts); spin_unlock_irq(&ucounts_lock); if (wrapped) { put_ucounts(ucounts); return NULL; } return ucounts; } void put_ucounts(struct ucounts *ucounts) { unsigned long flags; if (atomic_dec_and_lock_irqsave(&ucounts->count, &ucounts_lock, flags)) { hlist_del_init(&ucounts->node); spin_unlock_irqrestore(&ucounts_lock, flags); put_user_ns(ucounts->ns); kfree(ucounts); } } static inline bool atomic_long_inc_below(atomic_long_t *v, int u) { long c, old; c = atomic_long_read(v); for (;;) { if (unlikely(c >= u)) return false; old = atomic_long_cmpxchg(v, c, c+1); if (likely(old == c)) return true; c = old; } } struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type) { struct ucounts *ucounts, *iter, *bad; struct user_namespace *tns; ucounts = alloc_ucounts(ns, uid); for (iter = ucounts; iter; iter = tns->ucounts) { long max; tns = iter->ns; max = READ_ONCE(tns->ucount_max[type]); if (!atomic_long_inc_below(&iter->ucount[type], max)) goto fail; } return ucounts; fail: bad = iter; for (iter = ucounts; iter != bad; iter = iter->ns->ucounts) atomic_long_dec(&iter->ucount[type]); 
put_ucounts(ucounts); return NULL; } void dec_ucount(struct ucounts *ucounts, enum ucount_type type) { struct ucounts *iter; for (iter = ucounts; iter; iter = iter->ns->ucounts) { long dec = atomic_long_dec_if_positive(&iter->ucount[type]); WARN_ON_ONCE(dec < 0); } put_ucounts(ucounts); } long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v) { struct ucounts *iter; long max = LONG_MAX; long ret = 0; for (iter = ucounts; iter; iter = iter->ns->ucounts) { long new = atomic_long_add_return(v, &iter->rlimit[type]); if (new < 0 || new > max) ret = LONG_MAX; else if (iter == ucounts) ret = new; max = get_userns_rlimit_max(iter->ns, type); } return ret; } bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v) { struct ucounts *iter; long new = -1; /* Silence compiler warning */ for (iter = ucounts; iter; iter = iter->ns->ucounts) { long dec = atomic_long_sub_return(v, &iter->rlimit[type]); WARN_ON_ONCE(dec < 0); if (iter == ucounts) new = dec; } return (new == 0); } static void do_dec_rlimit_put_ucounts(struct ucounts *ucounts, struct ucounts *last, enum rlimit_type type) { struct ucounts *iter, *next; for (iter = ucounts; iter != last; iter = next) { long dec = atomic_long_sub_return(1, &iter->rlimit[type]); WARN_ON_ONCE(dec < 0); next = iter->ns->ucounts; if (dec == 0) put_ucounts(iter); } } void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type) { do_dec_rlimit_put_ucounts(ucounts, NULL, type); } long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type) { /* Caller must hold a reference to ucounts */ struct ucounts *iter; long max = LONG_MAX; long dec, ret = 0; for (iter = ucounts; iter; iter = iter->ns->ucounts) { long new = atomic_long_add_return(1, &iter->rlimit[type]); if (new < 0 || new > max) goto unwind; if (iter == ucounts) ret = new; max = get_userns_rlimit_max(iter->ns, type); /* * Grab an extra ucount reference for the caller when * the rlimit count was previously 0. 
*/ if (new != 1) continue; if (!get_ucounts(iter)) goto dec_unwind; } return ret; dec_unwind: dec = atomic_long_sub_return(1, &iter->rlimit[type]); WARN_ON_ONCE(dec < 0); unwind: do_dec_rlimit_put_ucounts(ucounts, iter, type); return 0; } bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long rlimit) { struct ucounts *iter; long max = rlimit; if (rlimit > LONG_MAX) max = LONG_MAX; for (iter = ucounts; iter; iter = iter->ns->ucounts) { long val = get_rlimit_value(iter, type); if (val < 0 || val > max) return true; max = get_userns_rlimit_max(iter->ns, type); } return false; } static __init int user_namespace_sysctl_init(void) { #ifdef CONFIG_SYSCTL static struct ctl_table_header *user_header; static struct ctl_table empty[1]; /* * It is necessary to register the user directory in the * default set so that registrations in the child sets work * properly. */ user_header = register_sysctl_sz("user", empty, 0); kmemleak_ignore(user_header); BUG_ON(!user_header); BUG_ON(!setup_userns_sysctls(&init_user_ns)); #endif hlist_add_ucounts(&init_ucounts); inc_rlimit_ucounts(&init_ucounts, UCOUNT_RLIMIT_NPROC, 1); return 0; } subsys_initcall(user_namespace_sysctl_init);
41 33 7 33 6 6 37 38 39 3 37 163 3 163 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 
518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 /* * Copyright (c) 2005 Voltaire Inc. All rights reserved. * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved. * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved. * Copyright (c) 2005 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include <linux/mutex.h> #include <linux/inetdevice.h> #include <linux/slab.h> #include <linux/workqueue.h> #include <net/arp.h> #include <net/neighbour.h> #include <net/route.h> #include <net/netevent.h> #include <net/ipv6_stubs.h> #include <net/ip6_route.h> #include <rdma/ib_addr.h> #include <rdma/ib_cache.h> #include <rdma/ib_sa.h> #include <rdma/ib.h> #include <rdma/rdma_netlink.h> #include <net/netlink.h> #include "core_priv.h" struct addr_req { struct list_head list; struct sockaddr_storage src_addr; struct sockaddr_storage dst_addr; struct rdma_dev_addr *addr; void *context; void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context); unsigned long timeout; struct delayed_work work; bool resolve_by_gid_attr; /* Consider gid attr in resolve phase */ int status; u32 seq; }; static atomic_t ib_nl_addr_request_seq = ATOMIC_INIT(0); static DEFINE_SPINLOCK(lock); static LIST_HEAD(req_list); static struct workqueue_struct *addr_wq; static const struct nla_policy ib_nl_addr_policy[LS_NLA_TYPE_MAX] = { [LS_NLA_TYPE_DGID] = {.type = NLA_BINARY, .len = sizeof(struct rdma_nla_ls_gid), .validation_type = NLA_VALIDATE_MIN, .min = sizeof(struct rdma_nla_ls_gid)}, }; static inline bool ib_nl_is_good_ip_resp(const struct nlmsghdr *nlh) { struct nlattr *tb[LS_NLA_TYPE_MAX] = {}; int ret; if (nlh->nlmsg_flags & RDMA_NL_LS_F_ERR) return false; ret = nla_parse_deprecated(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), nlmsg_len(nlh), ib_nl_addr_policy, NULL); if (ret) return false; return true; } static void ib_nl_process_good_ip_rsep(const struct nlmsghdr *nlh) { const struct nlattr *head, *curr; union ib_gid gid; struct addr_req *req; int len, rem; int found = 0; head = (const struct nlattr *)nlmsg_data(nlh); len = nlmsg_len(nlh); nla_for_each_attr(curr, head, len, rem) { if (curr->nla_type == LS_NLA_TYPE_DGID) memcpy(&gid, nla_data(curr), nla_len(curr)); } spin_lock_bh(&lock); list_for_each_entry(req, &req_list, list) { if (nlh->nlmsg_seq 
!= req->seq) continue; /* We set the DGID part, the rest was set earlier */ rdma_addr_set_dgid(req->addr, &gid); req->status = 0; found = 1; break; } spin_unlock_bh(&lock); if (!found) pr_info("Couldn't find request waiting for DGID: %pI6\n", &gid); } int ib_nl_handle_ip_res_resp(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { if ((nlh->nlmsg_flags & NLM_F_REQUEST) || !(NETLINK_CB(skb).sk)) return -EPERM; if (ib_nl_is_good_ip_resp(nlh)) ib_nl_process_good_ip_rsep(nlh); return 0; } static int ib_nl_ip_send_msg(struct rdma_dev_addr *dev_addr, const void *daddr, u32 seq, u16 family) { struct sk_buff *skb = NULL; struct nlmsghdr *nlh; struct rdma_ls_ip_resolve_header *header; void *data; size_t size; int attrtype; int len; if (family == AF_INET) { size = sizeof(struct in_addr); attrtype = RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_IPV4; } else { size = sizeof(struct in6_addr); attrtype = RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_IPV6; } len = nla_total_size(sizeof(size)); len += NLMSG_ALIGN(sizeof(*header)); skb = nlmsg_new(len, GFP_KERNEL); if (!skb) return -ENOMEM; data = ibnl_put_msg(skb, &nlh, seq, 0, RDMA_NL_LS, RDMA_NL_LS_OP_IP_RESOLVE, NLM_F_REQUEST); if (!data) { nlmsg_free(skb); return -ENODATA; } /* Construct the family header first */ header = skb_put(skb, NLMSG_ALIGN(sizeof(*header))); header->ifindex = dev_addr->bound_dev_if; nla_put(skb, attrtype, size, daddr); /* Repair the nlmsg header length */ nlmsg_end(skb, nlh); rdma_nl_multicast(&init_net, skb, RDMA_NL_GROUP_LS, GFP_KERNEL); /* Make the request retry, so when we get the response from userspace * we will have something. 
*/ return -ENODATA; } int rdma_addr_size(const struct sockaddr *addr) { switch (addr->sa_family) { case AF_INET: return sizeof(struct sockaddr_in); case AF_INET6: return sizeof(struct sockaddr_in6); case AF_IB: return sizeof(struct sockaddr_ib); default: return 0; } } EXPORT_SYMBOL(rdma_addr_size); int rdma_addr_size_in6(struct sockaddr_in6 *addr) { int ret = rdma_addr_size((struct sockaddr *) addr); return ret <= sizeof(*addr) ? ret : 0; } EXPORT_SYMBOL(rdma_addr_size_in6); int rdma_addr_size_kss(struct __kernel_sockaddr_storage *addr) { int ret = rdma_addr_size((struct sockaddr *) addr); return ret <= sizeof(*addr) ? ret : 0; } EXPORT_SYMBOL(rdma_addr_size_kss); /** * rdma_copy_src_l2_addr - Copy netdevice source addresses * @dev_addr: Destination address pointer where to copy the addresses * @dev: Netdevice whose source addresses to copy * * rdma_copy_src_l2_addr() copies source addresses from the specified netdevice. * This includes unicast address, broadcast address, device type and * interface index. */ void rdma_copy_src_l2_addr(struct rdma_dev_addr *dev_addr, const struct net_device *dev) { dev_addr->dev_type = dev->type; memcpy(dev_addr->src_dev_addr, dev->dev_addr, MAX_ADDR_LEN); memcpy(dev_addr->broadcast, dev->broadcast, MAX_ADDR_LEN); dev_addr->bound_dev_if = dev->ifindex; } EXPORT_SYMBOL(rdma_copy_src_l2_addr); static struct net_device * rdma_find_ndev_for_src_ip_rcu(struct net *net, const struct sockaddr *src_in) { struct net_device *dev = NULL; int ret = -EADDRNOTAVAIL; switch (src_in->sa_family) { case AF_INET: dev = __ip_dev_find(net, ((const struct sockaddr_in *)src_in)->sin_addr.s_addr, false); if (dev) ret = 0; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: for_each_netdev_rcu(net, dev) { if (ipv6_chk_addr(net, &((const struct sockaddr_in6 *)src_in)->sin6_addr, dev, 1)) { ret = 0; break; } } break; #endif } return ret ? 
ERR_PTR(ret) : dev; } int rdma_translate_ip(const struct sockaddr *addr, struct rdma_dev_addr *dev_addr) { struct net_device *dev; if (dev_addr->bound_dev_if) { dev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); if (!dev) return -ENODEV; rdma_copy_src_l2_addr(dev_addr, dev); dev_put(dev); return 0; } rcu_read_lock(); dev = rdma_find_ndev_for_src_ip_rcu(dev_addr->net, addr); if (!IS_ERR(dev)) rdma_copy_src_l2_addr(dev_addr, dev); rcu_read_unlock(); return PTR_ERR_OR_ZERO(dev); } EXPORT_SYMBOL(rdma_translate_ip); static void set_timeout(struct addr_req *req, unsigned long time) { unsigned long delay; delay = time - jiffies; if ((long)delay < 0) delay = 0; mod_delayed_work(addr_wq, &req->work, delay); } static void queue_req(struct addr_req *req) { spin_lock_bh(&lock); list_add_tail(&req->list, &req_list); set_timeout(req, req->timeout); spin_unlock_bh(&lock); } static int ib_nl_fetch_ha(struct rdma_dev_addr *dev_addr, const void *daddr, u32 seq, u16 family) { if (!rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) return -EADDRNOTAVAIL; return ib_nl_ip_send_msg(dev_addr, daddr, seq, family); } static int dst_fetch_ha(const struct dst_entry *dst, struct rdma_dev_addr *dev_addr, const void *daddr) { struct neighbour *n; int ret = 0; n = dst_neigh_lookup(dst, daddr); if (!n) return -ENODATA; if (!(n->nud_state & NUD_VALID)) { neigh_event_send(n, NULL); ret = -ENODATA; } else { neigh_ha_snapshot(dev_addr->dst_dev_addr, n, dst->dev); } neigh_release(n); return ret; } static bool has_gateway(const struct dst_entry *dst, sa_family_t family) { struct rtable *rt; struct rt6_info *rt6; if (family == AF_INET) { rt = container_of(dst, struct rtable, dst); return rt->rt_uses_gateway; } rt6 = container_of(dst, struct rt6_info, dst); return rt6->rt6i_flags & RTF_GATEWAY; } static int fetch_ha(const struct dst_entry *dst, struct rdma_dev_addr *dev_addr, const struct sockaddr *dst_in, u32 seq) { const struct sockaddr_in *dst_in4 = (const struct sockaddr_in *)dst_in; const struct 
sockaddr_in6 *dst_in6 = (const struct sockaddr_in6 *)dst_in; const void *daddr = (dst_in->sa_family == AF_INET) ? (const void *)&dst_in4->sin_addr.s_addr : (const void *)&dst_in6->sin6_addr; sa_family_t family = dst_in->sa_family; might_sleep(); /* If we have a gateway in IB mode then it must be an IB network */ if (has_gateway(dst, family) && dev_addr->network == RDMA_NETWORK_IB) return ib_nl_fetch_ha(dev_addr, daddr, seq, family); else return dst_fetch_ha(dst, dev_addr, daddr); } static int addr4_resolve(struct sockaddr *src_sock, const struct sockaddr *dst_sock, struct rdma_dev_addr *addr, struct rtable **prt) { struct sockaddr_in *src_in = (struct sockaddr_in *)src_sock; const struct sockaddr_in *dst_in = (const struct sockaddr_in *)dst_sock; __be32 src_ip = src_in->sin_addr.s_addr; __be32 dst_ip = dst_in->sin_addr.s_addr; struct rtable *rt; struct flowi4 fl4; int ret; memset(&fl4, 0, sizeof(fl4)); fl4.daddr = dst_ip; fl4.saddr = src_ip; fl4.flowi4_oif = addr->bound_dev_if; rt = ip_route_output_key(addr->net, &fl4); ret = PTR_ERR_OR_ZERO(rt); if (ret) return ret; src_in->sin_addr.s_addr = fl4.saddr; addr->hoplimit = ip4_dst_hoplimit(&rt->dst); *prt = rt; return 0; } #if IS_ENABLED(CONFIG_IPV6) static int addr6_resolve(struct sockaddr *src_sock, const struct sockaddr *dst_sock, struct rdma_dev_addr *addr, struct dst_entry **pdst) { struct sockaddr_in6 *src_in = (struct sockaddr_in6 *)src_sock; const struct sockaddr_in6 *dst_in = (const struct sockaddr_in6 *)dst_sock; struct flowi6 fl6; struct dst_entry *dst; memset(&fl6, 0, sizeof fl6); fl6.daddr = dst_in->sin6_addr; fl6.saddr = src_in->sin6_addr; fl6.flowi6_oif = addr->bound_dev_if; dst = ipv6_stub->ipv6_dst_lookup_flow(addr->net, NULL, &fl6, NULL); if (IS_ERR(dst)) return PTR_ERR(dst); if (ipv6_addr_any(&src_in->sin6_addr)) src_in->sin6_addr = fl6.saddr; addr->hoplimit = ip6_dst_hoplimit(dst); *pdst = dst; return 0; } #else static int addr6_resolve(struct sockaddr *src_sock, const struct sockaddr *dst_sock, 
struct rdma_dev_addr *addr, struct dst_entry **pdst) { return -EADDRNOTAVAIL; } #endif static int addr_resolve_neigh(const struct dst_entry *dst, const struct sockaddr *dst_in, struct rdma_dev_addr *addr, unsigned int ndev_flags, u32 seq) { int ret = 0; if (ndev_flags & IFF_LOOPBACK) { memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN); } else { if (!(ndev_flags & IFF_NOARP)) { /* If the device doesn't do ARP internally */ ret = fetch_ha(dst, addr, dst_in, seq); } } return ret; } static int copy_src_l2_addr(struct rdma_dev_addr *dev_addr, const struct sockaddr *dst_in, const struct dst_entry *dst, const struct net_device *ndev) { int ret = 0; if (dst->dev->flags & IFF_LOOPBACK) ret = rdma_translate_ip(dst_in, dev_addr); else rdma_copy_src_l2_addr(dev_addr, dst->dev); /* * If there's a gateway and type of device not ARPHRD_INFINIBAND, * we're definitely in RoCE v2 (as RoCE v1 isn't routable) set the * network type accordingly. */ if (has_gateway(dst, dst_in->sa_family) && ndev->type != ARPHRD_INFINIBAND) dev_addr->network = dst_in->sa_family == AF_INET ? RDMA_NETWORK_IPV4 : RDMA_NETWORK_IPV6; else dev_addr->network = RDMA_NETWORK_IB; return ret; } static int rdma_set_src_addr_rcu(struct rdma_dev_addr *dev_addr, unsigned int *ndev_flags, const struct sockaddr *dst_in, const struct dst_entry *dst) { struct net_device *ndev = READ_ONCE(dst->dev); *ndev_flags = ndev->flags; /* A physical device must be the RDMA device to use */ if (ndev->flags & IFF_LOOPBACK) { /* * RDMA (IB/RoCE, iWarp) doesn't run on lo interface or * loopback IP address. So if route is resolved to loopback * interface, translate that to a real ndev based on non * loopback IP address. 
*/ ndev = rdma_find_ndev_for_src_ip_rcu(dev_net(ndev), dst_in); if (IS_ERR(ndev)) return -ENODEV; } return copy_src_l2_addr(dev_addr, dst_in, dst, ndev); } static int set_addr_netns_by_gid_rcu(struct rdma_dev_addr *addr) { struct net_device *ndev; ndev = rdma_read_gid_attr_ndev_rcu(addr->sgid_attr); if (IS_ERR(ndev)) return PTR_ERR(ndev); /* * Since we are holding the rcu, reading net and ifindex * are safe without any additional reference; because * change_net_namespace() in net/core/dev.c does rcu sync * after it changes the state to IFF_DOWN and before * updating netdev fields {net, ifindex}. */ addr->net = dev_net(ndev); addr->bound_dev_if = ndev->ifindex; return 0; } static void rdma_addr_set_net_defaults(struct rdma_dev_addr *addr) { addr->net = &init_net; addr->bound_dev_if = 0; } static int addr_resolve(struct sockaddr *src_in, const struct sockaddr *dst_in, struct rdma_dev_addr *addr, bool resolve_neigh, bool resolve_by_gid_attr, u32 seq) { struct dst_entry *dst = NULL; unsigned int ndev_flags = 0; struct rtable *rt = NULL; int ret; if (!addr->net) { pr_warn_ratelimited("%s: missing namespace\n", __func__); return -EINVAL; } rcu_read_lock(); if (resolve_by_gid_attr) { if (!addr->sgid_attr) { rcu_read_unlock(); pr_warn_ratelimited("%s: missing gid_attr\n", __func__); return -EINVAL; } /* * If the request is for a specific gid attribute of the * rdma_dev_addr, derive net from the netdevice of the * GID attribute. */ ret = set_addr_netns_by_gid_rcu(addr); if (ret) { rcu_read_unlock(); return ret; } } if (src_in->sa_family == AF_INET) { ret = addr4_resolve(src_in, dst_in, addr, &rt); dst = &rt->dst; } else { ret = addr6_resolve(src_in, dst_in, addr, &dst); } if (ret) { rcu_read_unlock(); goto done; } ret = rdma_set_src_addr_rcu(addr, &ndev_flags, dst_in, dst); rcu_read_unlock(); /* * Resolve neighbor destination address if requested and * only if src addr translation didn't fail. 
*/ if (!ret && resolve_neigh) ret = addr_resolve_neigh(dst, dst_in, addr, ndev_flags, seq); if (src_in->sa_family == AF_INET) ip_rt_put(rt); else dst_release(dst); done: /* * Clear the addr net to go back to its original state, only if it was * derived from GID attribute in this context. */ if (resolve_by_gid_attr) rdma_addr_set_net_defaults(addr); return ret; } static void process_one_req(struct work_struct *_work) { struct addr_req *req; struct sockaddr *src_in, *dst_in; req = container_of(_work, struct addr_req, work.work); if (req->status == -ENODATA) { src_in = (struct sockaddr *)&req->src_addr; dst_in = (struct sockaddr *)&req->dst_addr; req->status = addr_resolve(src_in, dst_in, req->addr, true, req->resolve_by_gid_attr, req->seq); if (req->status && time_after_eq(jiffies, req->timeout)) { req->status = -ETIMEDOUT; } else if (req->status == -ENODATA) { /* requeue the work for retrying again */ spin_lock_bh(&lock); if (!list_empty(&req->list)) set_timeout(req, req->timeout); spin_unlock_bh(&lock); return; } } req->callback(req->status, (struct sockaddr *)&req->src_addr, req->addr, req->context); req->callback = NULL; spin_lock_bh(&lock); /* * Although the work will normally have been canceled by the workqueue, * it can still be requeued as long as it is on the req_list. 
*/ cancel_delayed_work(&req->work); if (!list_empty(&req->list)) { list_del_init(&req->list); kfree(req); } spin_unlock_bh(&lock); } int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr, struct rdma_dev_addr *addr, unsigned long timeout_ms, void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context), bool resolve_by_gid_attr, void *context) { struct sockaddr *src_in, *dst_in; struct addr_req *req; int ret = 0; req = kzalloc(sizeof *req, GFP_KERNEL); if (!req) return -ENOMEM; src_in = (struct sockaddr *) &req->src_addr; dst_in = (struct sockaddr *) &req->dst_addr; if (src_addr) { if (src_addr->sa_family != dst_addr->sa_family) { ret = -EINVAL; goto err; } memcpy(src_in, src_addr, rdma_addr_size(src_addr)); } else { src_in->sa_family = dst_addr->sa_family; } memcpy(dst_in, dst_addr, rdma_addr_size(dst_addr)); req->addr = addr; req->callback = callback; req->context = context; req->resolve_by_gid_attr = resolve_by_gid_attr; INIT_DELAYED_WORK(&req->work, process_one_req); req->seq = (u32)atomic_inc_return(&ib_nl_addr_request_seq); req->status = addr_resolve(src_in, dst_in, addr, true, req->resolve_by_gid_attr, req->seq); switch (req->status) { case 0: req->timeout = jiffies; queue_req(req); break; case -ENODATA: req->timeout = msecs_to_jiffies(timeout_ms) + jiffies; queue_req(req); break; default: ret = req->status; goto err; } return ret; err: kfree(req); return ret; } EXPORT_SYMBOL(rdma_resolve_ip); int roce_resolve_route_from_path(struct sa_path_rec *rec, const struct ib_gid_attr *attr) { union { struct sockaddr _sockaddr; struct sockaddr_in _sockaddr_in; struct sockaddr_in6 _sockaddr_in6; } sgid, dgid; struct rdma_dev_addr dev_addr = {}; int ret; might_sleep(); if (rec->roce.route_resolved) return 0; rdma_gid2ip((struct sockaddr *)&sgid, &rec->sgid); rdma_gid2ip((struct sockaddr *)&dgid, &rec->dgid); if (sgid._sockaddr.sa_family != dgid._sockaddr.sa_family) return -EINVAL; if (!attr || !attr->ndev) 
return -EINVAL; dev_addr.net = &init_net; dev_addr.sgid_attr = attr; ret = addr_resolve((struct sockaddr *)&sgid, (struct sockaddr *)&dgid, &dev_addr, false, true, 0); if (ret) return ret; if ((dev_addr.network == RDMA_NETWORK_IPV4 || dev_addr.network == RDMA_NETWORK_IPV6) && rec->rec_type != SA_PATH_REC_TYPE_ROCE_V2) return -EINVAL; rec->roce.route_resolved = true; return 0; } /** * rdma_addr_cancel - Cancel resolve ip request * @addr: Pointer to address structure given previously * during rdma_resolve_ip(). * rdma_addr_cancel() is synchronous function which cancels any pending * request if there is any. */ void rdma_addr_cancel(struct rdma_dev_addr *addr) { struct addr_req *req, *temp_req; struct addr_req *found = NULL; spin_lock_bh(&lock); list_for_each_entry_safe(req, temp_req, &req_list, list) { if (req->addr == addr) { /* * Removing from the list means we take ownership of * the req */ list_del_init(&req->list); found = req; break; } } spin_unlock_bh(&lock); if (!found) return; /* * sync canceling the work after removing it from the req_list * guarentees no work is running and none will be started. 
*/ cancel_delayed_work_sync(&found->work); kfree(found); } EXPORT_SYMBOL(rdma_addr_cancel); struct resolve_cb_context { struct completion comp; int status; }; static void resolve_cb(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context) { ((struct resolve_cb_context *)context)->status = status; complete(&((struct resolve_cb_context *)context)->comp); } int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, const union ib_gid *dgid, u8 *dmac, const struct ib_gid_attr *sgid_attr, int *hoplimit) { struct rdma_dev_addr dev_addr; struct resolve_cb_context ctx; union { struct sockaddr_in _sockaddr_in; struct sockaddr_in6 _sockaddr_in6; } sgid_addr, dgid_addr; int ret; rdma_gid2ip((struct sockaddr *)&sgid_addr, sgid); rdma_gid2ip((struct sockaddr *)&dgid_addr, dgid); memset(&dev_addr, 0, sizeof(dev_addr)); dev_addr.net = &init_net; dev_addr.sgid_attr = sgid_attr; init_completion(&ctx.comp); ret = rdma_resolve_ip((struct sockaddr *)&sgid_addr, (struct sockaddr *)&dgid_addr, &dev_addr, 1000, resolve_cb, true, &ctx); if (ret) return ret; wait_for_completion(&ctx.comp); ret = ctx.status; if (ret) return ret; memcpy(dmac, dev_addr.dst_dev_addr, ETH_ALEN); *hoplimit = dev_addr.hoplimit; return 0; } static int netevent_callback(struct notifier_block *self, unsigned long event, void *ctx) { struct addr_req *req; if (event == NETEVENT_NEIGH_UPDATE) { struct neighbour *neigh = ctx; if (neigh->nud_state & NUD_VALID) { spin_lock_bh(&lock); list_for_each_entry(req, &req_list, list) set_timeout(req, jiffies); spin_unlock_bh(&lock); } } return 0; } static struct notifier_block nb = { .notifier_call = netevent_callback }; int addr_init(void) { addr_wq = alloc_ordered_workqueue("ib_addr", 0); if (!addr_wq) return -ENOMEM; register_netevent_notifier(&nb); return 0; } void addr_cleanup(void) { unregister_netevent_notifier(&nb); destroy_workqueue(addr_wq); WARN_ON(!list_empty(&req_list)); }
3 15 1 1 2 2 2 1 1 2 9 18 2 5 11 11 1 1 1 1 1 1 1 15 1 2 8 6 8 1 5 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 
510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 // SPDX-License-Identifier: GPL-2.0-or-later /* * (C) 2012 Pablo Neira Ayuso <pablo@netfilter.org> * * This software has been sponsored by Vyatta Inc. 
<http://www.vyatta.com> */ #include <linux/init.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/netlink.h> #include <linux/rculist.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/list.h> #include <linux/errno.h> #include <linux/capability.h> #include <net/netlink.h> #include <net/sock.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> #include <linux/netfilter/nfnetlink_cthelper.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); MODULE_DESCRIPTION("nfnl_cthelper: User-space connection tracking helpers"); struct nfnl_cthelper { struct list_head list; struct nf_conntrack_helper helper; }; static LIST_HEAD(nfnl_cthelper_list); static int nfnl_userspace_cthelper(struct sk_buff *skb, unsigned int protoff, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { const struct nf_conn_help *help; struct nf_conntrack_helper *helper; help = nfct_help(ct); if (help == NULL) return NF_DROP; /* rcu_read_lock()ed by nf_hook_thresh */ helper = rcu_dereference(help->helper); if (helper == NULL) return NF_DROP; /* This is a user-space helper not yet configured, skip. */ if ((helper->flags & (NF_CT_HELPER_F_USERSPACE | NF_CT_HELPER_F_CONFIGURED)) == NF_CT_HELPER_F_USERSPACE) return NF_ACCEPT; /* If the user-space helper is not available, don't block traffic. 
*/ return NF_QUEUE_NR(helper->queue_num) | NF_VERDICT_FLAG_QUEUE_BYPASS; } static const struct nla_policy nfnl_cthelper_tuple_pol[NFCTH_TUPLE_MAX+1] = { [NFCTH_TUPLE_L3PROTONUM] = { .type = NLA_U16, }, [NFCTH_TUPLE_L4PROTONUM] = { .type = NLA_U8, }, }; static int nfnl_cthelper_parse_tuple(struct nf_conntrack_tuple *tuple, const struct nlattr *attr) { int err; struct nlattr *tb[NFCTH_TUPLE_MAX+1]; err = nla_parse_nested_deprecated(tb, NFCTH_TUPLE_MAX, attr, nfnl_cthelper_tuple_pol, NULL); if (err < 0) return err; if (!tb[NFCTH_TUPLE_L3PROTONUM] || !tb[NFCTH_TUPLE_L4PROTONUM]) return -EINVAL; /* Not all fields are initialized so first zero the tuple */ memset(tuple, 0, sizeof(struct nf_conntrack_tuple)); tuple->src.l3num = ntohs(nla_get_be16(tb[NFCTH_TUPLE_L3PROTONUM])); tuple->dst.protonum = nla_get_u8(tb[NFCTH_TUPLE_L4PROTONUM]); return 0; } static int nfnl_cthelper_from_nlattr(struct nlattr *attr, struct nf_conn *ct) { struct nf_conn_help *help = nfct_help(ct); const struct nf_conntrack_helper *helper; if (attr == NULL) return -EINVAL; helper = rcu_dereference(help->helper); if (!helper || helper->data_len == 0) return -EINVAL; nla_memcpy(help->data, attr, sizeof(help->data)); return 0; } static int nfnl_cthelper_to_nlattr(struct sk_buff *skb, const struct nf_conn *ct) { const struct nf_conn_help *help = nfct_help(ct); const struct nf_conntrack_helper *helper; helper = rcu_dereference(help->helper); if (helper && helper->data_len && nla_put(skb, CTA_HELP_INFO, helper->data_len, &help->data)) goto nla_put_failure; return 0; nla_put_failure: return -ENOSPC; } static const struct nla_policy nfnl_cthelper_expect_pol[NFCTH_POLICY_MAX+1] = { [NFCTH_POLICY_NAME] = { .type = NLA_NUL_STRING, .len = NF_CT_HELPER_NAME_LEN-1 }, [NFCTH_POLICY_EXPECT_MAX] = { .type = NLA_U32, }, [NFCTH_POLICY_EXPECT_TIMEOUT] = { .type = NLA_U32, }, }; static int nfnl_cthelper_expect_policy(struct nf_conntrack_expect_policy *expect_policy, const struct nlattr *attr) { int err; struct nlattr 
*tb[NFCTH_POLICY_MAX+1]; err = nla_parse_nested_deprecated(tb, NFCTH_POLICY_MAX, attr, nfnl_cthelper_expect_pol, NULL); if (err < 0) return err; if (!tb[NFCTH_POLICY_NAME] || !tb[NFCTH_POLICY_EXPECT_MAX] || !tb[NFCTH_POLICY_EXPECT_TIMEOUT]) return -EINVAL; nla_strscpy(expect_policy->name, tb[NFCTH_POLICY_NAME], NF_CT_HELPER_NAME_LEN); expect_policy->max_expected = ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_MAX])); if (expect_policy->max_expected > NF_CT_EXPECT_MAX_CNT) return -EINVAL; expect_policy->timeout = ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_TIMEOUT])); return 0; } static const struct nla_policy nfnl_cthelper_expect_policy_set[NFCTH_POLICY_SET_MAX+1] = { [NFCTH_POLICY_SET_NUM] = { .type = NLA_U32, }, }; static int nfnl_cthelper_parse_expect_policy(struct nf_conntrack_helper *helper, const struct nlattr *attr) { int i, ret; struct nf_conntrack_expect_policy *expect_policy; struct nlattr *tb[NFCTH_POLICY_SET_MAX+1]; unsigned int class_max; ret = nla_parse_nested_deprecated(tb, NFCTH_POLICY_SET_MAX, attr, nfnl_cthelper_expect_policy_set, NULL); if (ret < 0) return ret; if (!tb[NFCTH_POLICY_SET_NUM]) return -EINVAL; class_max = ntohl(nla_get_be32(tb[NFCTH_POLICY_SET_NUM])); if (class_max == 0) return -EINVAL; if (class_max > NF_CT_MAX_EXPECT_CLASSES) return -EOVERFLOW; expect_policy = kcalloc(class_max, sizeof(struct nf_conntrack_expect_policy), GFP_KERNEL); if (expect_policy == NULL) return -ENOMEM; for (i = 0; i < class_max; i++) { if (!tb[NFCTH_POLICY_SET+i]) goto err; ret = nfnl_cthelper_expect_policy(&expect_policy[i], tb[NFCTH_POLICY_SET+i]); if (ret < 0) goto err; } helper->expect_class_max = class_max - 1; helper->expect_policy = expect_policy; return 0; err: kfree(expect_policy); return -EINVAL; } static int nfnl_cthelper_create(const struct nlattr * const tb[], struct nf_conntrack_tuple *tuple) { struct nf_conntrack_helper *helper; struct nfnl_cthelper *nfcth; unsigned int size; int ret; if (!tb[NFCTH_TUPLE] || !tb[NFCTH_POLICY] || 
!tb[NFCTH_PRIV_DATA_LEN]) return -EINVAL; nfcth = kzalloc(sizeof(*nfcth), GFP_KERNEL); if (nfcth == NULL) return -ENOMEM; helper = &nfcth->helper; ret = nfnl_cthelper_parse_expect_policy(helper, tb[NFCTH_POLICY]); if (ret < 0) goto err1; nla_strscpy(helper->name, tb[NFCTH_NAME], NF_CT_HELPER_NAME_LEN); size = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN])); if (size > sizeof_field(struct nf_conn_help, data)) { ret = -ENOMEM; goto err2; } helper->data_len = size; helper->flags |= NF_CT_HELPER_F_USERSPACE; memcpy(&helper->tuple, tuple, sizeof(struct nf_conntrack_tuple)); helper->me = THIS_MODULE; helper->help = nfnl_userspace_cthelper; helper->from_nlattr = nfnl_cthelper_from_nlattr; helper->to_nlattr = nfnl_cthelper_to_nlattr; /* Default to queue number zero, this can be updated at any time. */ if (tb[NFCTH_QUEUE_NUM]) helper->queue_num = ntohl(nla_get_be32(tb[NFCTH_QUEUE_NUM])); if (tb[NFCTH_STATUS]) { int status = ntohl(nla_get_be32(tb[NFCTH_STATUS])); switch(status) { case NFCT_HELPER_STATUS_ENABLED: helper->flags |= NF_CT_HELPER_F_CONFIGURED; break; case NFCT_HELPER_STATUS_DISABLED: helper->flags &= ~NF_CT_HELPER_F_CONFIGURED; break; } } ret = nf_conntrack_helper_register(helper); if (ret < 0) goto err2; list_add_tail(&nfcth->list, &nfnl_cthelper_list); return 0; err2: kfree(helper->expect_policy); err1: kfree(nfcth); return ret; } static int nfnl_cthelper_update_policy_one(const struct nf_conntrack_expect_policy *policy, struct nf_conntrack_expect_policy *new_policy, const struct nlattr *attr) { struct nlattr *tb[NFCTH_POLICY_MAX + 1]; int err; err = nla_parse_nested_deprecated(tb, NFCTH_POLICY_MAX, attr, nfnl_cthelper_expect_pol, NULL); if (err < 0) return err; if (!tb[NFCTH_POLICY_NAME] || !tb[NFCTH_POLICY_EXPECT_MAX] || !tb[NFCTH_POLICY_EXPECT_TIMEOUT]) return -EINVAL; if (nla_strcmp(tb[NFCTH_POLICY_NAME], policy->name)) return -EBUSY; new_policy->max_expected = ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_MAX])); if (new_policy->max_expected > 
NF_CT_EXPECT_MAX_CNT) return -EINVAL; new_policy->timeout = ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_TIMEOUT])); return 0; } static int nfnl_cthelper_update_policy_all(struct nlattr *tb[], struct nf_conntrack_helper *helper) { struct nf_conntrack_expect_policy *new_policy; struct nf_conntrack_expect_policy *policy; int i, ret = 0; new_policy = kmalloc_array(helper->expect_class_max + 1, sizeof(*new_policy), GFP_KERNEL); if (!new_policy) return -ENOMEM; /* Check first that all policy attributes are well-formed, so we don't * leave things in inconsistent state on errors. */ for (i = 0; i < helper->expect_class_max + 1; i++) { if (!tb[NFCTH_POLICY_SET + i]) { ret = -EINVAL; goto err; } ret = nfnl_cthelper_update_policy_one(&helper->expect_policy[i], &new_policy[i], tb[NFCTH_POLICY_SET + i]); if (ret < 0) goto err; } /* Now we can safely update them. */ for (i = 0; i < helper->expect_class_max + 1; i++) { policy = (struct nf_conntrack_expect_policy *) &helper->expect_policy[i]; policy->max_expected = new_policy->max_expected; policy->timeout = new_policy->timeout; } err: kfree(new_policy); return ret; } static int nfnl_cthelper_update_policy(struct nf_conntrack_helper *helper, const struct nlattr *attr) { struct nlattr *tb[NFCTH_POLICY_SET_MAX + 1]; unsigned int class_max; int err; err = nla_parse_nested_deprecated(tb, NFCTH_POLICY_SET_MAX, attr, nfnl_cthelper_expect_policy_set, NULL); if (err < 0) return err; if (!tb[NFCTH_POLICY_SET_NUM]) return -EINVAL; class_max = ntohl(nla_get_be32(tb[NFCTH_POLICY_SET_NUM])); if (helper->expect_class_max + 1 != class_max) return -EBUSY; return nfnl_cthelper_update_policy_all(tb, helper); } static int nfnl_cthelper_update(const struct nlattr * const tb[], struct nf_conntrack_helper *helper) { u32 size; int ret; if (tb[NFCTH_PRIV_DATA_LEN]) { size = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN])); if (size != helper->data_len) return -EBUSY; } if (tb[NFCTH_POLICY]) { ret = nfnl_cthelper_update_policy(helper, tb[NFCTH_POLICY]); if (ret 
< 0)
			return ret;
	}

	if (tb[NFCTH_QUEUE_NUM])
		helper->queue_num = ntohl(nla_get_be32(tb[NFCTH_QUEUE_NUM]));

	if (tb[NFCTH_STATUS]) {
		int status = ntohl(nla_get_be32(tb[NFCTH_STATUS]));

		/* Any other status value leaves the flags untouched. */
		switch(status) {
		case NFCT_HELPER_STATUS_ENABLED:
			helper->flags |= NF_CT_HELPER_F_CONFIGURED;
			break;
		case NFCT_HELPER_STATUS_DISABLED:
			helper->flags &= ~NF_CT_HELPER_F_CONFIGURED;
			break;
		}
	}
	return 0;
}

/*
 * NFNL_MSG_CTHELPER_NEW handler: create a userspace helper or update an
 * existing one.
 *
 * Requires CAP_NET_ADMIN (-EPERM) and both NFCTH_NAME and NFCTH_TUPLE
 * (-EINVAL).  nfnl_cthelper_list is searched for an entry with the same name
 * and the same l3num/protonum pair.  If one is found it is updated, unless
 * the request carries NLM_F_EXCL, in which case -EEXIST is returned.  If
 * none is found a new helper is created.
 */
static int nfnl_cthelper_new(struct sk_buff *skb, const struct nfnl_info *info,
			     const struct nlattr * const tb[])
{
	const char *helper_name;
	struct nf_conntrack_helper *cur, *helper = NULL;
	struct nf_conntrack_tuple tuple;
	struct nfnl_cthelper *nlcth;
	int ret = 0;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	if (!tb[NFCTH_NAME] || !tb[NFCTH_TUPLE])
		return -EINVAL;

	helper_name = nla_data(tb[NFCTH_NAME]);

	ret = nfnl_cthelper_parse_tuple(&tuple, tb[NFCTH_TUPLE]);
	if (ret < 0)
		return ret;

	list_for_each_entry(nlcth, &nfnl_cthelper_list, list) {
		cur = &nlcth->helper;

		if (strncmp(cur->name, helper_name, NF_CT_HELPER_NAME_LEN))
			continue;

		if ((tuple.src.l3num != cur->tuple.src.l3num ||
		     tuple.dst.protonum != cur->tuple.dst.protonum))
			continue;

		if (info->nlh->nlmsg_flags & NLM_F_EXCL)
			return -EEXIST;

		/* First match wins. */
		helper = cur;
		break;
	}

	if (helper == NULL)
		ret = nfnl_cthelper_create(tb, &tuple);
	else
		ret = nfnl_cthelper_update(tb, helper);

	return ret;
}

/*
 * Append an NFCTH_TUPLE nest holding the helper's l3num (as be16) and
 * protonum (as u8) to @skb.  Returns 0 on success, -1 if an attribute could
 * not be added.
 */
static int
nfnl_cthelper_dump_tuple(struct sk_buff *skb,
			 struct nf_conntrack_helper *helper)
{
	struct nlattr *nest_parms;

	nest_parms = nla_nest_start(skb, NFCTH_TUPLE);
	if (nest_parms == NULL)
		goto nla_put_failure;

	if (nla_put_be16(skb, NFCTH_TUPLE_L3PROTONUM,
			 htons(helper->tuple.src.l3num)))
		goto nla_put_failure;

	if (nla_put_u8(skb, NFCTH_TUPLE_L4PROTONUM, helper->tuple.dst.protonum))
		goto nla_put_failure;

	nla_nest_end(skb, nest_parms);
	return 0;

nla_put_failure:
	return -1;
}

/*
 * Append an NFCTH_POLICY nest to @skb: the class count followed by one
 * NFCTH_POLICY_SET + i nest (name, max_expected, timeout) per class.
 * Returns 0 on success, -1 if an attribute could not be added.
 */
static int
nfnl_cthelper_dump_policy(struct sk_buff *skb,
			struct nf_conntrack_helper *helper)
{
	int i;
	struct nlattr *nest_parms1, *nest_parms2;

	nest_parms1 = nla_nest_start(skb,
NFCTH_POLICY);
	if (nest_parms1 == NULL)
		goto nla_put_failure;

	/* The count on the wire is the highest class index plus one. */
	if (nla_put_be32(skb, NFCTH_POLICY_SET_NUM,
			 htonl(helper->expect_class_max + 1)))
		goto nla_put_failure;

	for (i = 0; i < helper->expect_class_max + 1; i++) {
		nest_parms2 = nla_nest_start(skb, (NFCTH_POLICY_SET + i));
		if (nest_parms2 == NULL)
			goto nla_put_failure;

		if (nla_put_string(skb, NFCTH_POLICY_NAME,
				   helper->expect_policy[i].name))
			goto nla_put_failure;

		if (nla_put_be32(skb, NFCTH_POLICY_EXPECT_MAX,
				 htonl(helper->expect_policy[i].max_expected)))
			goto nla_put_failure;

		if (nla_put_be32(skb, NFCTH_POLICY_EXPECT_TIMEOUT,
				 htonl(helper->expect_policy[i].timeout)))
			goto nla_put_failure;

		nla_nest_end(skb, nest_parms2);
	}
	nla_nest_end(skb, nest_parms1);
	return 0;

nla_put_failure:
	return -1;
}

/*
 * Build one complete cthelper netlink message describing @helper in @skb:
 * name, queue number, tuple, policy, private data length and status.
 *
 * NLM_F_MULTI is set whenever @portid is non-zero.  @type is not referenced
 * in the body.  Returns skb->len on success; on failure the partially built
 * message is cancelled and -1 is returned.
 */
static int
nfnl_cthelper_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
			int event, struct nf_conntrack_helper *helper)
{
	struct nlmsghdr *nlh;
	unsigned int flags = portid ? NLM_F_MULTI : 0;
	int status;

	event = nfnl_msg_type(NFNL_SUBSYS_CTHELPER, event);
	nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC,
			   NFNETLINK_V0, 0);
	if (!nlh)
		goto nlmsg_failure;

	if (nla_put_string(skb, NFCTH_NAME, helper->name))
		goto nla_put_failure;

	if (nla_put_be32(skb, NFCTH_QUEUE_NUM, htonl(helper->queue_num)))
		goto nla_put_failure;

	if (nfnl_cthelper_dump_tuple(skb, helper) < 0)
		goto nla_put_failure;

	if (nfnl_cthelper_dump_policy(skb, helper) < 0)
		goto nla_put_failure;

	if (nla_put_be32(skb, NFCTH_PRIV_DATA_LEN, htonl(helper->data_len)))
		goto nla_put_failure;

	/* Status on the wire mirrors the NF_CT_HELPER_F_CONFIGURED flag. */
	if (helper->flags & NF_CT_HELPER_F_CONFIGURED)
		status = NFCT_HELPER_STATUS_ENABLED;
	else
		status = NFCT_HELPER_STATUS_DISABLED;

	if (nla_put_be32(skb, NFCTH_STATUS, htonl(status)))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return skb->len;

nlmsg_failure:
nla_put_failure:
	/* Both labels share one exit; nlh is NULL when coming from nlmsg_failure. */
	nlmsg_cancel(skb, nlh);
	return -1;
}

/*
 * Netlink dump callback: walk nf_ct_helper_hash under rcu_read_lock() and
 * emit one NFNL_MSG_CTHELPER_NEW message per helper that has
 * NF_CT_HELPER_F_USERSPACE set.
 *
 * cb->args[0] is the current bucket index and cb->args[1] holds the helper
 * at which a previous pass stopped because it could not be added to @skb.
 * Returns skb->len.
 */
static int
nfnl_cthelper_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct nf_conntrack_helper *cur, *last;

	rcu_read_lock();
	last = (struct
nf_conntrack_helper *)cb->args[1]; for (; cb->args[0] < nf_ct_helper_hsize; cb->args[0]++) { restart: hlist_for_each_entry_rcu(cur, &nf_ct_helper_hash[cb->args[0]], hnode) { /* skip non-userspace conntrack helpers. */ if (!(cur->flags & NF_CT_HELPER_F_USERSPACE)) continue; if (cb->args[1]) { if (cur != last) continue; cb->args[1] = 0; } if (nfnl_cthelper_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFNL_MSG_TYPE(cb->nlh->nlmsg_type), NFNL_MSG_CTHELPER_NEW, cur) < 0) { cb->args[1] = (unsigned long)cur; goto out; } } } if (cb->args[1]) { cb->args[1] = 0; goto restart; } out: rcu_read_unlock(); return skb->len; } static int nfnl_cthelper_get(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const tb[]) { int ret = -ENOENT; struct nf_conntrack_helper *cur; struct sk_buff *skb2; char *helper_name = NULL; struct nf_conntrack_tuple tuple; struct nfnl_cthelper *nlcth; bool tuple_set = false; if (!capable(CAP_NET_ADMIN)) return -EPERM; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = nfnl_cthelper_dump_table, }; return netlink_dump_start(info->sk, skb, info->nlh, &c); } if (tb[NFCTH_NAME]) helper_name = nla_data(tb[NFCTH_NAME]); if (tb[NFCTH_TUPLE]) { ret = nfnl_cthelper_parse_tuple(&tuple, tb[NFCTH_TUPLE]); if (ret < 0) return ret; tuple_set = true; } list_for_each_entry(nlcth, &nfnl_cthelper_list, list) { cur = &nlcth->helper; if (helper_name && strncmp(cur->name, helper_name, NF_CT_HELPER_NAME_LEN)) continue; if (tuple_set && (tuple.src.l3num != cur->tuple.src.l3num || tuple.dst.protonum != cur->tuple.dst.protonum)) continue; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (skb2 == NULL) { ret = -ENOMEM; break; } ret = nfnl_cthelper_fill_info(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, NFNL_MSG_TYPE(info->nlh->nlmsg_type), NFNL_MSG_CTHELPER_NEW, cur); if (ret <= 0) { kfree_skb(skb2); break; } ret = nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); break; } return ret; } 
static int nfnl_cthelper_del(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const tb[]) { char *helper_name = NULL; struct nf_conntrack_helper *cur; struct nf_conntrack_tuple tuple; bool tuple_set = false, found = false; struct nfnl_cthelper *nlcth, *n; int j = 0, ret; if (!capable(CAP_NET_ADMIN)) return -EPERM; if (tb[NFCTH_NAME]) helper_name = nla_data(tb[NFCTH_NAME]); if (tb[NFCTH_TUPLE]) { ret = nfnl_cthelper_parse_tuple(&tuple, tb[NFCTH_TUPLE]); if (ret < 0) return ret; tuple_set = true; } ret = -ENOENT; list_for_each_entry_safe(nlcth, n, &nfnl_cthelper_list, list) { cur = &nlcth->helper; j++; if (helper_name && strncmp(cur->name, helper_name, NF_CT_HELPER_NAME_LEN)) continue; if (tuple_set && (tuple.src.l3num != cur->tuple.src.l3num || tuple.dst.protonum != cur->tuple.dst.protonum)) continue; if (refcount_dec_if_one(&cur->refcnt)) { found = true; nf_conntrack_helper_unregister(cur); kfree(cur->expect_policy); list_del(&nlcth->list); kfree(nlcth); } else { ret = -EBUSY; } } /* Make sure we return success if we flush and there is no helpers */ return (found || j == 0) ? 
0 : ret; } static const struct nla_policy nfnl_cthelper_policy[NFCTH_MAX+1] = { [NFCTH_NAME] = { .type = NLA_NUL_STRING, .len = NF_CT_HELPER_NAME_LEN-1 }, [NFCTH_QUEUE_NUM] = { .type = NLA_U32, }, [NFCTH_PRIV_DATA_LEN] = { .type = NLA_U32, }, [NFCTH_STATUS] = { .type = NLA_U32, }, }; static const struct nfnl_callback nfnl_cthelper_cb[NFNL_MSG_CTHELPER_MAX] = { [NFNL_MSG_CTHELPER_NEW] = { .call = nfnl_cthelper_new, .type = NFNL_CB_MUTEX, .attr_count = NFCTH_MAX, .policy = nfnl_cthelper_policy }, [NFNL_MSG_CTHELPER_GET] = { .call = nfnl_cthelper_get, .type = NFNL_CB_MUTEX, .attr_count = NFCTH_MAX, .policy = nfnl_cthelper_policy }, [NFNL_MSG_CTHELPER_DEL] = { .call = nfnl_cthelper_del, .type = NFNL_CB_MUTEX, .attr_count = NFCTH_MAX, .policy = nfnl_cthelper_policy }, }; static const struct nfnetlink_subsystem nfnl_cthelper_subsys = { .name = "cthelper", .subsys_id = NFNL_SUBSYS_CTHELPER, .cb_count = NFNL_MSG_CTHELPER_MAX, .cb = nfnl_cthelper_cb, }; MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTHELPER); static int __init nfnl_cthelper_init(void) { int ret; ret = nfnetlink_subsys_register(&nfnl_cthelper_subsys); if (ret < 0) { pr_err("nfnl_cthelper: cannot register with nfnetlink.\n"); goto err_out; } return 0; err_out: return ret; } static void __exit nfnl_cthelper_exit(void) { struct nf_conntrack_helper *cur; struct nfnl_cthelper *nlcth, *n; nfnetlink_subsys_unregister(&nfnl_cthelper_subsys); list_for_each_entry_safe(nlcth, n, &nfnl_cthelper_list, list) { cur = &nlcth->helper; nf_conntrack_helper_unregister(cur); kfree(cur->expect_policy); kfree(nlcth); } } module_init(nfnl_cthelper_init); module_exit(nfnl_cthelper_exit);
3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 // SPDX-License-Identifier: GPL-2.0 /* net/atm/pvc.c - ATM PVC sockets */ /* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */ #include <linux/net.h> /* struct socket, struct proto_ops */ #include <linux/atm.h> /* ATM stuff */ #include <linux/atmdev.h> /* ATM devices */ #include <linux/errno.h> /* error codes */ #include <linux/kernel.h> /* printk */ #include <linux/init.h> #include <linux/skbuff.h> #include <linux/bitops.h> #include <linux/export.h> #include <net/sock.h> /* for sock_no_* */ #include "resources.h" /* devs and vccs */ #include "common.h" /* common for PVCs and SVCs */ static int pvc_shutdown(struct socket *sock, int how) { return 0; } static int pvc_bind(struct socket *sock, struct sockaddr *sockaddr, int sockaddr_len) { struct sock *sk = sock->sk; struct sockaddr_atmpvc *addr; struct atm_vcc *vcc; int error; if (sockaddr_len != sizeof(struct sockaddr_atmpvc)) return -EINVAL; addr = (struct sockaddr_atmpvc *)sockaddr; if (addr->sap_family != AF_ATMPVC) return -EAFNOSUPPORT; lock_sock(sk); vcc = ATM_SD(sock); if (!test_bit(ATM_VF_HASQOS, &vcc->flags)) { error = -EBADFD; goto out; } if (test_bit(ATM_VF_PARTIAL, &vcc->flags)) { if (vcc->vpi != ATM_VPI_UNSPEC) addr->sap_addr.vpi = vcc->vpi; if (vcc->vci != ATM_VCI_UNSPEC) addr->sap_addr.vci = vcc->vci; } error = vcc_connect(sock, addr->sap_addr.itf, addr->sap_addr.vpi, addr->sap_addr.vci); out: release_sock(sk); return error; } static int pvc_connect(struct 
socket *sock, struct sockaddr *sockaddr, int sockaddr_len, int flags) { return pvc_bind(sock, sockaddr, sockaddr_len); } static int pvc_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; int error; lock_sock(sk); error = vcc_setsockopt(sock, level, optname, optval, optlen); release_sock(sk); return error; } static int pvc_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; int error; lock_sock(sk); error = vcc_getsockopt(sock, level, optname, optval, optlen); release_sock(sk); return error; } static int pvc_getname(struct socket *sock, struct sockaddr *sockaddr, int peer) { struct sockaddr_atmpvc *addr; struct atm_vcc *vcc = ATM_SD(sock); if (!vcc->dev || !test_bit(ATM_VF_ADDR, &vcc->flags)) return -ENOTCONN; addr = (struct sockaddr_atmpvc *)sockaddr; memset(addr, 0, sizeof(*addr)); addr->sap_family = AF_ATMPVC; addr->sap_addr.itf = vcc->dev->number; addr->sap_addr.vpi = vcc->vpi; addr->sap_addr.vci = vcc->vci; return sizeof(struct sockaddr_atmpvc); } static const struct proto_ops pvc_proto_ops = { .family = PF_ATMPVC, .owner = THIS_MODULE, .release = vcc_release, .bind = pvc_bind, .connect = pvc_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = pvc_getname, .poll = vcc_poll, .ioctl = vcc_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = vcc_compat_ioctl, #endif .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = pvc_shutdown, .setsockopt = pvc_setsockopt, .getsockopt = pvc_getsockopt, .sendmsg = vcc_sendmsg, .recvmsg = vcc_recvmsg, .mmap = sock_no_mmap, }; static int pvc_create(struct net *net, struct socket *sock, int protocol, int kern) { if (net != &init_net) return -EAFNOSUPPORT; sock->ops = &pvc_proto_ops; return vcc_create(net, sock, protocol, PF_ATMPVC, kern); } static const struct net_proto_family pvc_family_ops = { .family = PF_ATMPVC, .create = pvc_create, .owner = 
THIS_MODULE, }; /* * Initialize the ATM PVC protocol family */ int __init atmpvc_init(void) { return sock_register(&pvc_family_ops); } void atmpvc_exit(void) { sock_unregister(PF_ATMPVC); }
60 56 1 1 15 14 2 2 61 64 64 62 6 6 18 19 1 1 1 1 162 164 2 29 196 196 46 53 11 1 1 4 5 3 2 2 3 2 3 3 3 3 3 4 4 4 3 18 1 1 2 12 2 6 1 1 2 1 1 8 1 1 1 3 1 1 1 1 10 3 6 1 3 6 7 7 6 1 1 2 1 1 1 2 2 8 1 1 2 2 1 1 5 1 8 8 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 
471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 
971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 
1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 
1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 
2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 
2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 
2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 /* * net/tipc/node.c: TIPC node management routines * * Copyright (c) 2000-2006, 2012-2016, Ericsson AB * Copyright (c) 2005-2006, 2010-2014, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. 
*
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "core.h"
#include "link.h"
#include "node.h"
#include "name_distr.h"
#include "socket.h"
#include "bcast.h"
#include "monitor.h"
#include "discover.h"
#include "netlink.h"
#include "trace.h"
#include "crypto.h"

#define INVALID_NODE_SIG	0x10000
/* NOTE(review): unit not stated here; presumably milliseconds - confirm. */
#define NODE_CLEANUP_AFTER	300000

/* Flags used to take different actions according to flag type
 * TIPC_NOTIFY_NODE_DOWN: notify node is down
 * TIPC_NOTIFY_NODE_UP: notify node is up
 * TIPC_DISTRIBUTE_NAME: publish or withdraw link state name type
 *
 * NOTE(review): the enum below defines TIPC_NOTIFY_LINK_UP and
 * TIPC_NOTIFY_LINK_DOWN, which are not described above, and does not define
 * TIPC_DISTRIBUTE_NAME - confirm whether this list is out of date.
 */
enum {
	TIPC_NOTIFY_NODE_DOWN		= (1 << 3),
	TIPC_NOTIFY_NODE_UP		= (1 << 4),
	TIPC_NOTIFY_LINK_UP		= (1 << 6),
	TIPC_NOTIFY_LINK_DOWN		= (1 << 7)
};

/* One slot of tipc_node.links[], an array of MAX_BEARERS such entries. */
struct tipc_link_entry {
	struct tipc_link *link;
	spinlock_t lock; /* per link */
	u32 mtu;
	struct sk_buff_head inputq;
	struct tipc_media_addr maddr;
};

/* Broadcast link state, embedded in struct tipc_node as bc_entry. */
struct tipc_bclink_entry {
	struct tipc_link *link;
	struct sk_buff_head inputq1;
	struct sk_buff_head arrvq;
	struct sk_buff_head inputq2;
	struct sk_buff_head namedq;
	u16 named_rcv_nxt;
	bool named_open;
};

/**
 * struct tipc_node - TIPC node structure
 * @addr: network address of node
 * @kref: reference counter to node object
 * @lock: rwlock governing access to structure
 * @net: the applicable net namespace
 * @hash: links to adjacent nodes in unsorted hash chain
 * @active_links: bearer ids of active links, used as index into links[] array
 * @links: array containing references to all links to node
 * @bc_entry: broadcast link entry
 * @action_flags: bit mask of different types of node actions
 * @state: connectivity state vs peer node
 * @preliminary: a preliminary node or not
 * @failover_sent: failover sent or not
 * @sync_point: sequence number where synch/failover is finished
 * @list: links to adjacent nodes in sorted list of cluster's nodes
 * @working_links: number of working links to node (both active and standby)
 * @link_cnt: number of links to node
 * @capabilities: bitmap, indicating peer node's functional capabilities
 * @signature: node instance identifier
 * @link_id: local and remote bearer ids of changing link, if any
 * @peer_id: 128-bit ID of peer
 * @peer_id_string: ID string of peer
 * @publ_list: list of publications
 * @conn_sks: list of connections (FIXME)
 * @timer: node's keepalive timer
 * @keepalive_intv: keepalive interval in milliseconds
 * @rcu: rcu struct for tipc_node
 * @delete_at: indicates the time for deleting a down node
 * @peer_net: peer's net namespace
 * @peer_hash_mix: hash for this peer (FIXME)
 * @crypto_rx: RX crypto handler
 */
struct tipc_node {
	u32 addr;
	struct kref kref;
	rwlock_t lock;
	struct net *net;
	struct hlist_node hash;
	int active_links[2];
	struct tipc_link_entry links[MAX_BEARERS];
	struct tipc_bclink_entry bc_entry;
	int action_flags;
	struct list_head list;
	int state;
	bool preliminary;
	bool failover_sent;
	u16 sync_point;
	int link_cnt;
	u16 working_links;
	u16 capabilities;
	u32 signature;
	u32 link_id;
	u8 peer_id[16];
	char peer_id_string[NODE_ID_STR_LEN];
	struct list_head publ_list;
	struct list_head conn_sks;
	unsigned long keepalive_intv;
	struct timer_list timer;
	struct rcu_head rcu;
	unsigned long delete_at;
	struct net *peer_net;
	u32 peer_hash_mix;
#ifdef CONFIG_TIPC_CRYPTO
	struct tipc_crypto *crypto_rx;
#endif
};

/* Node FSM states and events:
 */
enum {
	SELF_DOWN_PEER_DOWN    = 0xdd,
	SELF_UP_PEER_UP        = 0xaa,
	SELF_DOWN_PEER_LEAVING = 0xd1,
	SELF_UP_PEER_COMING    = 0xac,
	SELF_COMING_PEER_UP    = 0xca,
	SELF_LEAVING_PEER_DOWN = 0x1d,
	NODE_FAILINGOVER       = 0xf0,
	NODE_SYNCHING          = 0xcc
};

enum {
	SELF_ESTABL_CONTACT_EVT = 0xece,
	SELF_LOST_CONTACT_EVT   = 0x1ce,
	PEER_ESTABL_CONTACT_EVT = 0x9ece,
	PEER_LOST_CONTACT_EVT   = 0x91ce,
	NODE_FAILOVER_BEGIN_EVT = 0xfbe,
	NODE_FAILOVER_END_EVT   = 0xfee,
	NODE_SYNCH_BEGIN_EVT    = 0xcbe,
	NODE_SYNCH_END_EVT      = 0xcee
};

/* Forward declarations of the file-local helpers used below. */
static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id,
				  struct sk_buff_head *xmitq,
				  struct tipc_media_addr **maddr);
static void tipc_node_link_down(struct tipc_node *n, int bearer_id,
				bool delete);
static void node_lost_contact(struct tipc_node *n, struct sk_buff_head *inputq);
static void tipc_node_delete(struct tipc_node *node);
static void tipc_node_timeout(struct timer_list *t);
static void tipc_node_fsm_evt(struct tipc_node *n, int evt);
static struct tipc_node *tipc_node_find(struct net *net, u32 addr);
static struct tipc_node *tipc_node_find_by_id(struct net *net, u8 *id);
static bool node_is_up(struct tipc_node *n);
static void tipc_node_delete_from_list(struct tipc_node *node);

/* NOTE(review): presumably an element of tipc_node.conn_sks - confirm. */
struct tipc_sock_conn {
	u32 port;
	u32 peer_port;
	u32 peer_node;
	struct list_head list;
};

/*
 * Return the link of node @n selected by @sel, or NULL if there is none.
 * Only the lowest bit of @sel is used, to pick one of the two active_links
 * slots; a slot holding INVALID_BEARER_ID yields NULL.
 */
static struct tipc_link *node_active_link(struct tipc_node *n, int sel)
{
	int bearer_id = n->active_links[sel & 1];

	if (unlikely(bearer_id == INVALID_BEARER_ID))
		return NULL;

	return n->links[bearer_id].link;
}

int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel, bool connected)
{
	struct tipc_node *n;
	int bearer_id;
	unsigned int mtu = MAX_MSG_SIZE;

	n = tipc_node_find(net, addr);
	if (unlikely(!n))
		return mtu;

	/* Allow MAX_MSG_SIZE when building connection oriented message
	 * if they are in the same core network
	 */
	if (n->peer_net && connected) {
		tipc_node_put(n);
		return mtu;
	}

	bearer_id = n->active_links[sel & 1];
	if (likely(bearer_id != INVALID_BEARER_ID))
		mtu = n->links[bearer_id].mtu;
	tipc_node_put(n);
return mtu; } bool tipc_node_get_id(struct net *net, u32 addr, u8 *id) { u8 *own_id = tipc_own_id(net); struct tipc_node *n; if (!own_id) return true; if (addr == tipc_own_addr(net)) { memcpy(id, own_id, TIPC_NODEID_LEN); return true; } n = tipc_node_find(net, addr); if (!n) return false; memcpy(id, &n->peer_id, TIPC_NODEID_LEN); tipc_node_put(n); return true; } u16 tipc_node_get_capabilities(struct net *net, u32 addr) { struct tipc_node *n; u16 caps; n = tipc_node_find(net, addr); if (unlikely(!n)) return TIPC_NODE_CAPABILITIES; caps = n->capabilities; tipc_node_put(n); return caps; } u32 tipc_node_get_addr(struct tipc_node *node) { return (node) ? node->addr : 0; } char *tipc_node_get_id_str(struct tipc_node *node) { return node->peer_id_string; } #ifdef CONFIG_TIPC_CRYPTO /** * tipc_node_crypto_rx - Retrieve crypto RX handle from node * @__n: target tipc_node * Note: node ref counter must be held first! */ struct tipc_crypto *tipc_node_crypto_rx(struct tipc_node *__n) { return (__n) ? __n->crypto_rx : NULL; } struct tipc_crypto *tipc_node_crypto_rx_by_list(struct list_head *pos) { return container_of(pos, struct tipc_node, list)->crypto_rx; } struct tipc_crypto *tipc_node_crypto_rx_by_addr(struct net *net, u32 addr) { struct tipc_node *n; n = tipc_node_find(net, addr); return (n) ? 
n->crypto_rx : NULL; } #endif static void tipc_node_free(struct rcu_head *rp) { struct tipc_node *n = container_of(rp, struct tipc_node, rcu); #ifdef CONFIG_TIPC_CRYPTO tipc_crypto_stop(&n->crypto_rx); #endif kfree(n); } static void tipc_node_kref_release(struct kref *kref) { struct tipc_node *n = container_of(kref, struct tipc_node, kref); kfree(n->bc_entry.link); call_rcu(&n->rcu, tipc_node_free); } void tipc_node_put(struct tipc_node *node) { kref_put(&node->kref, tipc_node_kref_release); } void tipc_node_get(struct tipc_node *node) { kref_get(&node->kref); } /* * tipc_node_find - locate specified node object, if it exists */ static struct tipc_node *tipc_node_find(struct net *net, u32 addr) { struct tipc_net *tn = tipc_net(net); struct tipc_node *node; unsigned int thash = tipc_hashfn(addr); rcu_read_lock(); hlist_for_each_entry_rcu(node, &tn->node_htable[thash], hash) { if (node->addr != addr || node->preliminary) continue; if (!kref_get_unless_zero(&node->kref)) node = NULL; break; } rcu_read_unlock(); return node; } /* tipc_node_find_by_id - locate specified node object by its 128-bit id * Note: this function is called only when a discovery request failed * to find the node by its 32-bit id, and is not time critical */ static struct tipc_node *tipc_node_find_by_id(struct net *net, u8 *id) { struct tipc_net *tn = tipc_net(net); struct tipc_node *n; bool found = false; rcu_read_lock(); list_for_each_entry_rcu(n, &tn->node_list, list) { read_lock_bh(&n->lock); if (!memcmp(id, n->peer_id, 16) && kref_get_unless_zero(&n->kref)) found = true; read_unlock_bh(&n->lock); if (found) break; } rcu_read_unlock(); return found ? 
n : NULL;
}

/* Take the node rwlock for reading, with bottom halves disabled */
static void tipc_node_read_lock(struct tipc_node *n)
	__acquires(n->lock)
{
	read_lock_bh(&n->lock);
}

/* Release a lock taken with tipc_node_read_lock() */
static void tipc_node_read_unlock(struct tipc_node *n)
	__releases(n->lock)
{
	read_unlock_bh(&n->lock);
}

/* Take the node rwlock for writing, with bottom halves disabled */
static void tipc_node_write_lock(struct tipc_node *n)
	__acquires(n->lock)
{
	write_lock_bh(&n->lock);
}

/* Release the write lock without looking at n->action_flags; any pending
 * notification flags stay set
 */
static void tipc_node_write_unlock_fast(struct tipc_node *n)
	__releases(n->lock)
{
	write_unlock_bh(&n->lock);
}

/* Release the write lock and carry out the notifications requested through
 * n->action_flags.
 *
 * Everything the notifications need is copied while the lock is still held,
 * the four TIPC_NOTIFY_* bits are cleared, and only then is the lock dropped
 * and the name table / monitor called.
 */
static void tipc_node_write_unlock(struct tipc_node *n)
	__releases(n->lock)
{
	struct tipc_socket_addr sk;
	struct net *net = n->net;
	u32 flags = n->action_flags;
	struct list_head *publ_list;
	struct tipc_uaddr ua;
	u32 bearer_id, node;

	/* Common case: nothing to notify */
	if (likely(!flags)) {
		write_unlock_bh(&n->lock);
		return;
	}

	/* Link state name: type TIPC_LINK_STATE, range [addr, addr] */
	tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_NODE_SCOPE,
		   TIPC_LINK_STATE, n->addr, n->addr);
	sk.ref = n->link_id;
	sk.node = tipc_own_addr(net);
	node = n->addr;
	/* Low 16 bits of link_id are used as the bearer id for the monitor */
	bearer_id = n->link_id & 0xffff;
	publ_list = &n->publ_list;

	n->action_flags &= ~(TIPC_NOTIFY_NODE_DOWN | TIPC_NOTIFY_NODE_UP |
			     TIPC_NOTIFY_LINK_DOWN | TIPC_NOTIFY_LINK_UP);

	write_unlock_bh(&n->lock);

	/* From here on the node lock is no longer held; note that
	 * n->capabilities is read after the unlock
	 */
	if (flags & TIPC_NOTIFY_NODE_DOWN)
		tipc_publ_notify(net, publ_list, node, n->capabilities);

	if (flags & TIPC_NOTIFY_NODE_UP)
		tipc_named_node_up(net, node, n->capabilities);

	if (flags & TIPC_NOTIFY_LINK_UP) {
		tipc_mon_peer_up(net, node, bearer_id);
		tipc_nametbl_publish(net, &ua, &sk, sk.ref);
	}
	if (flags & TIPC_NOTIFY_LINK_DOWN) {
		tipc_mon_peer_down(net, node, bearer_id);
		tipc_nametbl_withdraw(net, &ua, &sk, sk.ref);
	}
}

/* Try to resolve n->peer_net: walk all net namespaces looking for one whose
 * TIPC instance has the same net id and node id as the peer and whose hash
 * mix equals @hash_mixes. Does nothing when peer_net is already set.
 */
static void tipc_node_assign_peer_net(struct tipc_node *n, u32 hash_mixes)
{
	int net_id = tipc_netid(n->net);
	struct tipc_net *tn_peer;
	struct net *tmp;
	u32 hash_chk;

	if (n->peer_net)
		return;

	for_each_net_rcu(tmp) {
		tn_peer = tipc_net(tmp);
		if (!tn_peer)
			continue;
		/* Integrity checking whether node exists in namespace or not */
		if (tn_peer->net_id != net_id)
			continue;
		if (memcmp(n->peer_id, tn_peer->node_id, NODE_ID_LEN))
			continue;
		hash_chk = tipc_net_hash_mixes(tmp, tn_peer->random);
		if
(hash_mixes ^ hash_chk) continue; n->peer_net = tmp; n->peer_hash_mix = hash_mixes; break; } } struct tipc_node *tipc_node_create(struct net *net, u32 addr, u8 *peer_id, u16 capabilities, u32 hash_mixes, bool preliminary) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_link *l, *snd_l = tipc_bc_sndlink(net); struct tipc_node *n, *temp_node; unsigned long intv; int bearer_id; int i; spin_lock_bh(&tn->node_list_lock); n = tipc_node_find(net, addr) ?: tipc_node_find_by_id(net, peer_id); if (n) { if (!n->preliminary) goto update; if (preliminary) goto exit; /* A preliminary node becomes "real" now, refresh its data */ tipc_node_write_lock(n); if (!tipc_link_bc_create(net, tipc_own_addr(net), addr, peer_id, U16_MAX, tipc_link_min_win(snd_l), tipc_link_max_win(snd_l), n->capabilities, &n->bc_entry.inputq1, &n->bc_entry.namedq, snd_l, &n->bc_entry.link)) { pr_warn("Broadcast rcv link refresh failed, no memory\n"); tipc_node_write_unlock_fast(n); tipc_node_put(n); n = NULL; goto exit; } n->preliminary = false; n->addr = addr; hlist_del_rcu(&n->hash); hlist_add_head_rcu(&n->hash, &tn->node_htable[tipc_hashfn(addr)]); list_del_rcu(&n->list); list_for_each_entry_rcu(temp_node, &tn->node_list, list) { if (n->addr < temp_node->addr) break; } list_add_tail_rcu(&n->list, &temp_node->list); tipc_node_write_unlock_fast(n); update: if (n->peer_hash_mix ^ hash_mixes) tipc_node_assign_peer_net(n, hash_mixes); if (n->capabilities == capabilities) goto exit; /* Same node may come back with new capabilities */ tipc_node_write_lock(n); n->capabilities = capabilities; for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) { l = n->links[bearer_id].link; if (l) tipc_link_update_caps(l, capabilities); } tipc_node_write_unlock_fast(n); /* Calculate cluster capabilities */ tn->capabilities = TIPC_NODE_CAPABILITIES; list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; } tipc_bcast_toggle_rcast(net, (tn->capabilities & 
TIPC_BCAST_RCAST)); goto exit; } n = kzalloc(sizeof(*n), GFP_ATOMIC); if (!n) { pr_warn("Node creation failed, no memory\n"); goto exit; } tipc_nodeid2string(n->peer_id_string, peer_id); #ifdef CONFIG_TIPC_CRYPTO if (unlikely(tipc_crypto_start(&n->crypto_rx, net, n))) { pr_warn("Failed to start crypto RX(%s)!\n", n->peer_id_string); kfree(n); n = NULL; goto exit; } #endif n->addr = addr; n->preliminary = preliminary; memcpy(&n->peer_id, peer_id, 16); n->net = net; n->peer_net = NULL; n->peer_hash_mix = 0; /* Assign kernel local namespace if exists */ tipc_node_assign_peer_net(n, hash_mixes); n->capabilities = capabilities; kref_init(&n->kref); rwlock_init(&n->lock); INIT_HLIST_NODE(&n->hash); INIT_LIST_HEAD(&n->list); INIT_LIST_HEAD(&n->publ_list); INIT_LIST_HEAD(&n->conn_sks); skb_queue_head_init(&n->bc_entry.namedq); skb_queue_head_init(&n->bc_entry.inputq1); __skb_queue_head_init(&n->bc_entry.arrvq); skb_queue_head_init(&n->bc_entry.inputq2); for (i = 0; i < MAX_BEARERS; i++) spin_lock_init(&n->links[i].lock); n->state = SELF_DOWN_PEER_LEAVING; n->delete_at = jiffies + msecs_to_jiffies(NODE_CLEANUP_AFTER); n->signature = INVALID_NODE_SIG; n->active_links[0] = INVALID_BEARER_ID; n->active_links[1] = INVALID_BEARER_ID; if (!preliminary && !tipc_link_bc_create(net, tipc_own_addr(net), addr, peer_id, U16_MAX, tipc_link_min_win(snd_l), tipc_link_max_win(snd_l), n->capabilities, &n->bc_entry.inputq1, &n->bc_entry.namedq, snd_l, &n->bc_entry.link)) { pr_warn("Broadcast rcv link creation failed, no memory\n"); tipc_node_put(n); n = NULL; goto exit; } tipc_node_get(n); timer_setup(&n->timer, tipc_node_timeout, 0); /* Start a slow timer anyway, crypto needs it */ n->keepalive_intv = 10000; intv = jiffies + msecs_to_jiffies(n->keepalive_intv); if (!mod_timer(&n->timer, intv)) tipc_node_get(n); hlist_add_head_rcu(&n->hash, &tn->node_htable[tipc_hashfn(addr)]); list_for_each_entry_rcu(temp_node, &tn->node_list, list) { if (n->addr < temp_node->addr) break; } 
list_add_tail_rcu(&n->list, &temp_node->list); /* Calculate cluster capabilities */ tn->capabilities = TIPC_NODE_CAPABILITIES; list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; } tipc_bcast_toggle_rcast(net, (tn->capabilities & TIPC_BCAST_RCAST)); trace_tipc_node_create(n, true, " "); exit: spin_unlock_bh(&tn->node_list_lock); return n; } static void tipc_node_calculate_timer(struct tipc_node *n, struct tipc_link *l) { unsigned long tol = tipc_link_tolerance(l); unsigned long intv = ((tol / 4) > 500) ? 500 : tol / 4; /* Link with lowest tolerance determines timer interval */ if (intv < n->keepalive_intv) n->keepalive_intv = intv; /* Ensure link's abort limit corresponds to current tolerance */ tipc_link_set_abort_limit(l, tol / n->keepalive_intv); } static void tipc_node_delete_from_list(struct tipc_node *node) { #ifdef CONFIG_TIPC_CRYPTO tipc_crypto_key_flush(node->crypto_rx); #endif list_del_rcu(&node->list); hlist_del_rcu(&node->hash); tipc_node_put(node); } static void tipc_node_delete(struct tipc_node *node) { trace_tipc_node_delete(node, true, " "); tipc_node_delete_from_list(node); del_timer_sync(&node->timer); tipc_node_put(node); } void tipc_node_stop(struct net *net) { struct tipc_net *tn = tipc_net(net); struct tipc_node *node, *t_node; spin_lock_bh(&tn->node_list_lock); list_for_each_entry_safe(node, t_node, &tn->node_list, list) tipc_node_delete(node); spin_unlock_bh(&tn->node_list_lock); } void tipc_node_subscribe(struct net *net, struct list_head *subscr, u32 addr) { struct tipc_node *n; if (in_own_node(net, addr)) return; n = tipc_node_find(net, addr); if (!n) { pr_warn("Node subscribe rejected, unknown node 0x%x\n", addr); return; } tipc_node_write_lock(n); list_add_tail(subscr, &n->publ_list); tipc_node_write_unlock_fast(n); tipc_node_put(n); } void tipc_node_unsubscribe(struct net *net, struct list_head *subscr, u32 addr) { struct tipc_node *n; if (in_own_node(net, addr)) return; n = 
tipc_node_find(net, addr); if (!n) { pr_warn("Node unsubscribe rejected, unknown node 0x%x\n", addr); return; } tipc_node_write_lock(n); list_del_init(subscr); tipc_node_write_unlock_fast(n); tipc_node_put(n); } int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port) { struct tipc_node *node; struct tipc_sock_conn *conn; int err = 0; if (in_own_node(net, dnode)) return 0; node = tipc_node_find(net, dnode); if (!node) { pr_warn("Connecting sock to node 0x%x failed\n", dnode); return -EHOSTUNREACH; } conn = kmalloc(sizeof(*conn), GFP_ATOMIC); if (!conn) { err = -EHOSTUNREACH; goto exit; } conn->peer_node = dnode; conn->port = port; conn->peer_port = peer_port; tipc_node_write_lock(node); list_add_tail(&conn->list, &node->conn_sks); tipc_node_write_unlock(node); exit: tipc_node_put(node); return err; } void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port) { struct tipc_node *node; struct tipc_sock_conn *conn, *safe; if (in_own_node(net, dnode)) return; node = tipc_node_find(net, dnode); if (!node) return; tipc_node_write_lock(node); list_for_each_entry_safe(conn, safe, &node->conn_sks, list) { if (port != conn->port) continue; list_del(&conn->list); kfree(conn); } tipc_node_write_unlock(node); tipc_node_put(node); } static void tipc_node_clear_links(struct tipc_node *node) { int i; for (i = 0; i < MAX_BEARERS; i++) { struct tipc_link_entry *le = &node->links[i]; if (le->link) { kfree(le->link); le->link = NULL; node->link_cnt--; } } } /* tipc_node_cleanup - delete nodes that does not * have active links for NODE_CLEANUP_AFTER time */ static bool tipc_node_cleanup(struct tipc_node *peer) { struct tipc_node *temp_node; struct tipc_net *tn = tipc_net(peer->net); bool deleted = false; /* If lock held by tipc_node_stop() the node will be deleted anyway */ if (!spin_trylock_bh(&tn->node_list_lock)) return false; tipc_node_write_lock(peer); if (!node_is_up(peer) && time_after(jiffies, peer->delete_at)) { tipc_node_clear_links(peer); 
tipc_node_delete_from_list(peer); deleted = true; } tipc_node_write_unlock(peer); if (!deleted) { spin_unlock_bh(&tn->node_list_lock); return deleted; } /* Calculate cluster capabilities */ tn->capabilities = TIPC_NODE_CAPABILITIES; list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; } tipc_bcast_toggle_rcast(peer->net, (tn->capabilities & TIPC_BCAST_RCAST)); spin_unlock_bh(&tn->node_list_lock); return deleted; } /* tipc_node_timeout - handle expiration of node timer */ static void tipc_node_timeout(struct timer_list *t) { struct tipc_node *n = from_timer(n, t, timer); struct tipc_link_entry *le; struct sk_buff_head xmitq; int remains = n->link_cnt; int bearer_id; int rc = 0; trace_tipc_node_timeout(n, false, " "); if (!node_is_up(n) && tipc_node_cleanup(n)) { /*Removing the reference of Timer*/ tipc_node_put(n); return; } #ifdef CONFIG_TIPC_CRYPTO /* Take any crypto key related actions first */ tipc_crypto_timeout(n->crypto_rx); #endif __skb_queue_head_init(&xmitq); /* Initial node interval to value larger (10 seconds), then it will be * recalculated with link lowest tolerance */ tipc_node_read_lock(n); n->keepalive_intv = 10000; tipc_node_read_unlock(n); for (bearer_id = 0; remains && (bearer_id < MAX_BEARERS); bearer_id++) { tipc_node_read_lock(n); le = &n->links[bearer_id]; if (le->link) { spin_lock_bh(&le->lock); /* Link tolerance may change asynchronously: */ tipc_node_calculate_timer(n, le->link); rc = tipc_link_timeout(le->link, &xmitq); spin_unlock_bh(&le->lock); remains--; } tipc_node_read_unlock(n); tipc_bearer_xmit(n->net, bearer_id, &xmitq, &le->maddr, n); if (rc & TIPC_LINK_DOWN_EVT) tipc_node_link_down(n, bearer_id, false); } mod_timer(&n->timer, jiffies + msecs_to_jiffies(n->keepalive_intv)); } /** * __tipc_node_link_up - handle addition of link * @n: target tipc_node * @bearer_id: id of the bearer * @xmitq: queue for messages to be xmited on * Node lock must be held by caller * Link becomes active 
(alone or shared) or standby, depending on its priority. */ static void __tipc_node_link_up(struct tipc_node *n, int bearer_id, struct sk_buff_head *xmitq) { int *slot0 = &n->active_links[0]; int *slot1 = &n->active_links[1]; struct tipc_link *ol = node_active_link(n, 0); struct tipc_link *nl = n->links[bearer_id].link; if (!nl || tipc_link_is_up(nl)) return; tipc_link_fsm_evt(nl, LINK_ESTABLISH_EVT); if (!tipc_link_is_up(nl)) return; n->working_links++; n->action_flags |= TIPC_NOTIFY_LINK_UP; n->link_id = tipc_link_id(nl); /* Leave room for tunnel header when returning 'mtu' to users: */ n->links[bearer_id].mtu = tipc_link_mss(nl); tipc_bearer_add_dest(n->net, bearer_id, n->addr); tipc_bcast_inc_bearer_dst_cnt(n->net, bearer_id); pr_debug("Established link <%s> on network plane %c\n", tipc_link_name(nl), tipc_link_plane(nl)); trace_tipc_node_link_up(n, true, " "); /* Ensure that a STATE message goes first */ tipc_link_build_state_msg(nl, xmitq); /* First link? => give it both slots */ if (!ol) { *slot0 = bearer_id; *slot1 = bearer_id; tipc_node_fsm_evt(n, SELF_ESTABL_CONTACT_EVT); n->action_flags |= TIPC_NOTIFY_NODE_UP; tipc_link_set_active(nl, true); tipc_bcast_add_peer(n->net, nl, xmitq); return; } /* Second link => redistribute slots */ if (tipc_link_prio(nl) > tipc_link_prio(ol)) { pr_debug("Old link <%s> becomes standby\n", tipc_link_name(ol)); *slot0 = bearer_id; *slot1 = bearer_id; tipc_link_set_active(nl, true); tipc_link_set_active(ol, false); } else if (tipc_link_prio(nl) == tipc_link_prio(ol)) { tipc_link_set_active(nl, true); *slot1 = bearer_id; } else { pr_debug("New link <%s> is standby\n", tipc_link_name(nl)); } /* Prepare synchronization with first link */ tipc_link_tnl_prepare(ol, nl, SYNCH_MSG, xmitq); } /** * tipc_node_link_up - handle addition of link * @n: target tipc_node * @bearer_id: id of the bearer * @xmitq: queue for messages to be xmited on * * Link becomes active (alone or shared) or standby, depending on its priority. 
*/ static void tipc_node_link_up(struct tipc_node *n, int bearer_id, struct sk_buff_head *xmitq) { struct tipc_media_addr *maddr; tipc_node_write_lock(n); __tipc_node_link_up(n, bearer_id, xmitq); maddr = &n->links[bearer_id].maddr; tipc_bearer_xmit(n->net, bearer_id, xmitq, maddr, n); tipc_node_write_unlock(n); } /** * tipc_node_link_failover() - start failover in case "half-failover" * * This function is only called in a very special situation where link * failover can be already started on peer node but not on this node. * This can happen when e.g.:: * * 1. Both links <1A-2A>, <1B-2B> down * 2. Link endpoint 2A up, but 1A still down (e.g. due to network * disturbance, wrong session, etc.) * 3. Link <1B-2B> up * 4. Link endpoint 2A down (e.g. due to link tolerance timeout) * 5. Node 2 starts failover onto link <1B-2B> * * ==> Node 1 does never start link/node failover! * * @n: tipc node structure * @l: link peer endpoint failingover (- can be NULL) * @tnl: tunnel link * @xmitq: queue for messages to be xmited on tnl link later */ static void tipc_node_link_failover(struct tipc_node *n, struct tipc_link *l, struct tipc_link *tnl, struct sk_buff_head *xmitq) { /* Avoid to be "self-failover" that can never end */ if (!tipc_link_is_up(tnl)) return; /* Don't rush, failure link may be in the process of resetting */ if (l && !tipc_link_is_reset(l)) return; tipc_link_fsm_evt(tnl, LINK_SYNCH_END_EVT); tipc_node_fsm_evt(n, NODE_SYNCH_END_EVT); n->sync_point = tipc_link_rcv_nxt(tnl) + (U16_MAX / 2 - 1); tipc_link_failover_prepare(l, tnl, xmitq); if (l) tipc_link_fsm_evt(l, LINK_FAILOVER_BEGIN_EVT); tipc_node_fsm_evt(n, NODE_FAILOVER_BEGIN_EVT); } /** * __tipc_node_link_down - handle loss of link * @n: target tipc_node * @bearer_id: id of the bearer * @xmitq: queue for messages to be xmited on * @maddr: output media address of the bearer */ static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id, struct sk_buff_head *xmitq, struct tipc_media_addr **maddr) { 
struct tipc_link_entry *le = &n->links[*bearer_id]; int *slot0 = &n->active_links[0]; int *slot1 = &n->active_links[1]; int i, highest = 0, prio; struct tipc_link *l, *_l, *tnl; l = n->links[*bearer_id].link; if (!l || tipc_link_is_reset(l)) return; n->working_links--; n->action_flags |= TIPC_NOTIFY_LINK_DOWN; n->link_id = tipc_link_id(l); tipc_bearer_remove_dest(n->net, *bearer_id, n->addr); pr_debug("Lost link <%s> on network plane %c\n", tipc_link_name(l), tipc_link_plane(l)); /* Select new active link if any available */ *slot0 = INVALID_BEARER_ID; *slot1 = INVALID_BEARER_ID; for (i = 0; i < MAX_BEARERS; i++) { _l = n->links[i].link; if (!_l || !tipc_link_is_up(_l)) continue; if (_l == l) continue; prio = tipc_link_prio(_l); if (prio < highest) continue; if (prio > highest) { highest = prio; *slot0 = i; *slot1 = i; continue; } *slot1 = i; } if (!node_is_up(n)) { if (tipc_link_peer_is_down(l)) tipc_node_fsm_evt(n, PEER_LOST_CONTACT_EVT); tipc_node_fsm_evt(n, SELF_LOST_CONTACT_EVT); trace_tipc_link_reset(l, TIPC_DUMP_ALL, "link down!"); tipc_link_fsm_evt(l, LINK_RESET_EVT); tipc_link_reset(l); tipc_link_build_reset_msg(l, xmitq); *maddr = &n->links[*bearer_id].maddr; node_lost_contact(n, &le->inputq); tipc_bcast_dec_bearer_dst_cnt(n->net, *bearer_id); return; } tipc_bcast_dec_bearer_dst_cnt(n->net, *bearer_id); /* There is still a working link => initiate failover */ *bearer_id = n->active_links[0]; tnl = n->links[*bearer_id].link; tipc_link_fsm_evt(tnl, LINK_SYNCH_END_EVT); tipc_node_fsm_evt(n, NODE_SYNCH_END_EVT); n->sync_point = tipc_link_rcv_nxt(tnl) + (U16_MAX / 2 - 1); tipc_link_tnl_prepare(l, tnl, FAILOVER_MSG, xmitq); trace_tipc_link_reset(l, TIPC_DUMP_ALL, "link down -> failover!"); tipc_link_reset(l); tipc_link_fsm_evt(l, LINK_RESET_EVT); tipc_link_fsm_evt(l, LINK_FAILOVER_BEGIN_EVT); tipc_node_fsm_evt(n, NODE_FAILOVER_BEGIN_EVT); *maddr = &n->links[*bearer_id].maddr; } static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete) { 
struct tipc_link_entry *le = &n->links[bearer_id]; struct tipc_media_addr *maddr = NULL; struct tipc_link *l = le->link; int old_bearer_id = bearer_id; struct sk_buff_head xmitq; if (!l) return; __skb_queue_head_init(&xmitq); tipc_node_write_lock(n); if (!tipc_link_is_establishing(l)) { __tipc_node_link_down(n, &bearer_id, &xmitq, &maddr); } else { /* Defuse pending tipc_node_link_up() */ tipc_link_reset(l); tipc_link_fsm_evt(l, LINK_RESET_EVT); } if (delete) { kfree(l); le->link = NULL; n->link_cnt--; } trace_tipc_node_link_down(n, true, "node link down or deleted!"); tipc_node_write_unlock(n); if (delete) tipc_mon_remove_peer(n->net, n->addr, old_bearer_id); if (!skb_queue_empty(&xmitq)) tipc_bearer_xmit(n->net, bearer_id, &xmitq, maddr, n); tipc_sk_rcv(n->net, &le->inputq); } static bool node_is_up(struct tipc_node *n) { return n->active_links[0] != INVALID_BEARER_ID; } bool tipc_node_is_up(struct net *net, u32 addr) { struct tipc_node *n; bool retval = false; if (in_own_node(net, addr)) return true; n = tipc_node_find(net, addr); if (!n) return false; retval = node_is_up(n); tipc_node_put(n); return retval; } static u32 tipc_node_suggest_addr(struct net *net, u32 addr) { struct tipc_node *n; addr ^= tipc_net(net)->random; while ((n = tipc_node_find(net, addr))) { tipc_node_put(n); addr++; } return addr; } /* tipc_node_try_addr(): Check if addr can be used by peer, suggest other if not * Returns suggested address if any, otherwise 0 */ u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr) { struct tipc_net *tn = tipc_net(net); struct tipc_node *n; bool preliminary; u32 sugg_addr; /* Suggest new address if some other peer is using this one */ n = tipc_node_find(net, addr); if (n) { if (!memcmp(n->peer_id, id, NODE_ID_LEN)) addr = 0; tipc_node_put(n); if (!addr) return 0; return tipc_node_suggest_addr(net, addr); } /* Suggest previously used address if peer is known */ n = tipc_node_find_by_id(net, id); if (n) { sugg_addr = n->addr; preliminary = 
n->preliminary; tipc_node_put(n); if (!preliminary) return sugg_addr; } /* Even this node may be in conflict */ if (tn->trial_addr == addr) return tipc_node_suggest_addr(net, addr); return 0; } void tipc_node_check_dest(struct net *net, u32 addr, u8 *peer_id, struct tipc_bearer *b, u16 capabilities, u32 signature, u32 hash_mixes, struct tipc_media_addr *maddr, bool *respond, bool *dupl_addr) { struct tipc_node *n; struct tipc_link *l; struct tipc_link_entry *le; bool addr_match = false; bool sign_match = false; bool link_up = false; bool link_is_reset = false; bool accept_addr = false; bool reset = false; char *if_name; unsigned long intv; u16 session; *dupl_addr = false; *respond = false; n = tipc_node_create(net, addr, peer_id, capabilities, hash_mixes, false); if (!n) return; tipc_node_write_lock(n); le = &n->links[b->identity]; /* Prepare to validate requesting node's signature and media address */ l = le->link; link_up = l && tipc_link_is_up(l); link_is_reset = l && tipc_link_is_reset(l); addr_match = l && !memcmp(&le->maddr, maddr, sizeof(*maddr)); sign_match = (signature == n->signature); /* These three flags give us eight permutations: */ if (sign_match && addr_match && link_up) { /* All is fine. Ignore requests. */ /* Peer node is not a container/local namespace */ if (!n->peer_hash_mix) n->peer_hash_mix = hash_mixes; } else if (sign_match && addr_match && !link_up) { /* Respond. The link will come up in due time */ *respond = true; } else if (sign_match && !addr_match && link_up) { /* Peer has changed i/f address without rebooting. * If so, the link will reset soon, and the next * discovery will be accepted. So we can ignore it. * It may also be a cloned or malicious peer having * chosen the same node address and signature as an * existing one. * Ignore requests until the link goes down, if ever. */ *dupl_addr = true; } else if (sign_match && !addr_match && !link_up) { /* Peer link has changed i/f address without rebooting. 
* It may also be a cloned or malicious peer; we can't * distinguish between the two. * The signature is correct, so we must accept. */ accept_addr = true; *respond = true; reset = true; } else if (!sign_match && addr_match && link_up) { /* Peer node rebooted. Two possibilities: * - Delayed re-discovery; this link endpoint has already * reset and re-established contact with the peer, before * receiving a discovery message from that node. * (The peer happened to receive one from this node first). * - The peer came back so fast that our side has not * discovered it yet. Probing from this side will soon * reset the link, since there can be no working link * endpoint at the peer end, and the link will re-establish. * Accept the signature, since it comes from a known peer. */ n->signature = signature; } else if (!sign_match && addr_match && !link_up) { /* The peer node has rebooted. * Accept signature, since it is a known peer. */ n->signature = signature; *respond = true; } else if (!sign_match && !addr_match && link_up) { /* Peer rebooted with new address, or a new/duplicate peer. * Ignore until the link goes down, if ever. */ *dupl_addr = true; } else if (!sign_match && !addr_match && !link_up) { /* Peer rebooted with new address, or it is a new peer. * Accept signature and address. 
*/
		n->signature = signature;
		accept_addr = true;
		*respond = true;
		reset = true;
	}

	if (!accept_addr)
		goto exit;

	/* Now create new link if not already existing */
	if (!l) {
		if (n->link_cnt == 2)
			goto exit;

		/* NOTE(review): the strchr() result is used without a NULL
		 * check - presumably bearer names always contain ':'; confirm.
		 */
		if_name = strchr(b->name, ':') + 1;
		get_random_bytes(&session, sizeof(u16));
		if (!tipc_link_create(net, if_name, b->identity, b->tolerance,
				      b->net_plane, b->mtu, b->priority,
				      b->min_win, b->max_win, session,
				      tipc_own_addr(net), addr, peer_id,
				      n->capabilities,
				      tipc_bc_sndlink(n->net), n->bc_entry.link,
				      &le->inputq,
				      &n->bc_entry.namedq, &l)) {
			*respond = false;
			goto exit;
		}
		trace_tipc_link_reset(l, TIPC_DUMP_ALL, "link created!");
		tipc_link_reset(l);
		tipc_link_fsm_evt(l, LINK_RESET_EVT);
		if (n->state == NODE_FAILINGOVER)
			tipc_link_fsm_evt(l, LINK_FAILOVER_BEGIN_EVT);
		link_is_reset = tipc_link_is_reset(l);
		le->link = l;
		n->link_cnt++;
		tipc_node_calculate_timer(n, l);
		/* First link of this node: (re)arm the node timer */
		if (n->link_cnt == 1) {
			intv = jiffies + msecs_to_jiffies(n->keepalive_intv);
			if (!mod_timer(&n->timer, intv))
				tipc_node_get(n);
		}
	}
	memcpy(&le->maddr, maddr, sizeof(*maddr));
exit:
	tipc_node_write_unlock(n);
	/* The link is taken down only after the node write lock is released */
	if (reset && !link_is_reset)
		tipc_node_link_down(n, b->identity, false);
	tipc_node_put(n);
}

/* tipc_node_delete_links - take down one bearer's link on every known node
 * @net: the applicable net namespace
 * @bearer_id: bearer whose links are taken down
 *
 * Walks the namespace's node list under rcu_read_lock() and calls
 * tipc_node_link_down(n, bearer_id, true) for each entry.
 */
void tipc_node_delete_links(struct net *net, int bearer_id)
{
	struct tipc_net *tn = net_generic(net, tipc_net_id);
	struct tipc_node *n;

	rcu_read_lock();
	list_for_each_entry_rcu(n, &tn->node_list, list) {
		tipc_node_link_down(n, bearer_id, true);
	}
	rcu_read_unlock();
}

/* tipc_node_reset_links - take down the link in every bearer slot of a node
 * @n: the peer node
 *
 * Logs a warning, then calls tipc_node_link_down(n, i, false) for each of
 * the MAX_BEARERS slots.
 */
static void tipc_node_reset_links(struct tipc_node *n)
{
	int i;

	pr_warn("Resetting all links to %x\n", n->addr);

	trace_tipc_node_reset_links(n, true, " ");
	for (i = 0; i < MAX_BEARERS; i++) {
		tipc_node_link_down(n, i, false);
	}
}

/* tipc_node_fsm_evt - node finite state machine
 * Determines when contact is allowed with peer node
 *
 * @n: node whose state is evaluated and, for a legal event, updated
 * @evt: event to apply
 *
 * The next state is selected by a nested switch on (current state, event).
 * Events that are listed with a bare 'break' are accepted but leave the
 * state unchanged. An event that is illegal in the current state is
 * reported with pr_err() and n->state is not written. An unknown current
 * state is reported as well; n->state is then rewritten with its old value.
 * Both exits emit the trace_tipc_node_fsm() tracepoint.
 */
static void tipc_node_fsm_evt(struct tipc_node *n, int evt)
{
	int state = n->state;

	switch (state) {
	case SELF_DOWN_PEER_DOWN:
		switch (evt) {
		case SELF_ESTABL_CONTACT_EVT:
			state = SELF_UP_PEER_COMING;
			break;
		case PEER_ESTABL_CONTACT_EVT:
			state = SELF_COMING_PEER_UP;
			break;
		case SELF_LOST_CONTACT_EVT:
		case PEER_LOST_CONTACT_EVT:
			break;
		case NODE_SYNCH_END_EVT:
		case NODE_SYNCH_BEGIN_EVT:
		case NODE_FAILOVER_BEGIN_EVT:
		case NODE_FAILOVER_END_EVT:
		default:
			goto illegal_evt;
		}
		break;
	case SELF_UP_PEER_UP:
		switch (evt) {
		case SELF_LOST_CONTACT_EVT:
			state = SELF_DOWN_PEER_LEAVING;
			break;
		case PEER_LOST_CONTACT_EVT:
			state = SELF_LEAVING_PEER_DOWN;
			break;
		case NODE_SYNCH_BEGIN_EVT:
			state = NODE_SYNCHING;
			break;
		case NODE_FAILOVER_BEGIN_EVT:
			state = NODE_FAILINGOVER;
			break;
		case SELF_ESTABL_CONTACT_EVT:
		case PEER_ESTABL_CONTACT_EVT:
		case NODE_SYNCH_END_EVT:
		case NODE_FAILOVER_END_EVT:
			break;
		default:
			goto illegal_evt;
		}
		break;
	case SELF_DOWN_PEER_LEAVING:
		switch (evt) {
		case PEER_LOST_CONTACT_EVT:
			state = SELF_DOWN_PEER_DOWN;
			break;
		case SELF_ESTABL_CONTACT_EVT:
		case PEER_ESTABL_CONTACT_EVT:
		case SELF_LOST_CONTACT_EVT:
			break;
		case NODE_SYNCH_END_EVT:
		case NODE_SYNCH_BEGIN_EVT:
		case NODE_FAILOVER_BEGIN_EVT:
		case NODE_FAILOVER_END_EVT:
		default:
			goto illegal_evt;
		}
		break;
	case SELF_UP_PEER_COMING:
		switch (evt) {
		case PEER_ESTABL_CONTACT_EVT:
			state = SELF_UP_PEER_UP;
			break;
		case SELF_LOST_CONTACT_EVT:
			state = SELF_DOWN_PEER_DOWN;
			break;
		case SELF_ESTABL_CONTACT_EVT:
		case PEER_LOST_CONTACT_EVT:
		case NODE_SYNCH_END_EVT:
		case NODE_FAILOVER_BEGIN_EVT:
			break;
		case NODE_SYNCH_BEGIN_EVT:
		case NODE_FAILOVER_END_EVT:
		default:
			goto illegal_evt;
		}
		break;
	case SELF_COMING_PEER_UP:
		switch (evt) {
		case SELF_ESTABL_CONTACT_EVT:
			state = SELF_UP_PEER_UP;
			break;
		case PEER_LOST_CONTACT_EVT:
			state = SELF_DOWN_PEER_DOWN;
			break;
		case SELF_LOST_CONTACT_EVT:
		case PEER_ESTABL_CONTACT_EVT:
			break;
		case NODE_SYNCH_END_EVT:
		case NODE_SYNCH_BEGIN_EVT:
		case NODE_FAILOVER_BEGIN_EVT:
		case NODE_FAILOVER_END_EVT:
		default:
			goto illegal_evt;
		}
		break;
	case SELF_LEAVING_PEER_DOWN:
		switch (evt) {
		case SELF_LOST_CONTACT_EVT:
			state = SELF_DOWN_PEER_DOWN;
			break;
		case SELF_ESTABL_CONTACT_EVT:
		case PEER_ESTABL_CONTACT_EVT:
		case PEER_LOST_CONTACT_EVT:
			break;
		case NODE_SYNCH_END_EVT:
		case NODE_SYNCH_BEGIN_EVT:
		case NODE_FAILOVER_BEGIN_EVT:
		case NODE_FAILOVER_END_EVT:
		default:
			goto illegal_evt;
		}
		break;
	case NODE_FAILINGOVER:
		switch (evt) {
		case SELF_LOST_CONTACT_EVT:
			state = SELF_DOWN_PEER_LEAVING;
			break;
		case PEER_LOST_CONTACT_EVT:
			state = SELF_LEAVING_PEER_DOWN;
			break;
		case NODE_FAILOVER_END_EVT:
			state = SELF_UP_PEER_UP;
			break;
		case NODE_FAILOVER_BEGIN_EVT:
		case SELF_ESTABL_CONTACT_EVT:
		case PEER_ESTABL_CONTACT_EVT:
			break;
		case NODE_SYNCH_BEGIN_EVT:
		case NODE_SYNCH_END_EVT:
		default:
			goto illegal_evt;
		}
		break;
	case NODE_SYNCHING:
		switch (evt) {
		case SELF_LOST_CONTACT_EVT:
			state = SELF_DOWN_PEER_LEAVING;
			break;
		case PEER_LOST_CONTACT_EVT:
			state = SELF_LEAVING_PEER_DOWN;
			break;
		case NODE_SYNCH_END_EVT:
			state = SELF_UP_PEER_UP;
			break;
		case NODE_FAILOVER_BEGIN_EVT:
			state = NODE_FAILINGOVER;
			break;
		case NODE_SYNCH_BEGIN_EVT:
		case SELF_ESTABL_CONTACT_EVT:
		case PEER_ESTABL_CONTACT_EVT:
			break;
		case NODE_FAILOVER_END_EVT:
		default:
			goto illegal_evt;
		}
		break;
	default:
		pr_err("Unknown node fsm state %x\n", state);
		break;
	}
	/* Trace old -> new state before committing the new one */
	trace_tipc_node_fsm(n->peer_id, n->state, state, evt);
	n->state = state;
	return;

illegal_evt:
	pr_err("Illegal node fsm evt %x in state %x\n", evt, state);
	trace_tipc_node_fsm(n->peer_id, n->state, state, evt);
}

/* node_lost_contact - clean up node state after contact with the peer is lost
 * @n: the peer node
 * @inputq: queue that receives the abort messages created for the sockets
 *	    listed in n->conn_sks
 */
static void node_lost_contact(struct tipc_node *n,
			      struct sk_buff_head *inputq)
{
	struct tipc_sock_conn *conn, *safe;
	struct tipc_link *l;
	struct list_head *conns = &n->conn_sks;
	struct sk_buff *skb;
	uint i;

	pr_debug("Lost contact with %x\n", n->addr);
	/* Record the jiffies value NODE_CLEANUP_AFTER msecs from now */
	n->delete_at = jiffies + msecs_to_jiffies(NODE_CLEANUP_AFTER);
	trace_tipc_node_lost_contact(n, true, " ");

	/* Clean up broadcast state */
	tipc_bcast_remove_peer(n->net, n->bc_entry.link);
	skb_queue_purge(&n->bc_entry.namedq);

	/* Abort any ongoing link failover */
	for (i = 0; i < MAX_BEARERS; i++) {
		l = n->links[i].link;
		if (l)
			tipc_link_fsm_evt(l, LINK_FAILOVER_END_EVT);
	}

	/* Notify publications from this node */
n->action_flags |= TIPC_NOTIFY_NODE_DOWN; n->peer_net = NULL; n->peer_hash_mix = 0; /* Notify sockets connected to node */ list_for_each_entry_safe(conn, safe, conns, list) { skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_CONN_MSG, SHORT_H_SIZE, 0, tipc_own_addr(n->net), conn->peer_node, conn->port, conn->peer_port, TIPC_ERR_NO_NODE); if (likely(skb)) skb_queue_tail(inputq, skb); list_del(&conn->list); kfree(conn); } } /** * tipc_node_get_linkname - get the name of a link * * @net: the applicable net namespace * @bearer_id: id of the bearer * @addr: peer node address * @linkname: link name output buffer * @len: size of @linkname output buffer * * Return: 0 on success */ int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 addr, char *linkname, size_t len) { struct tipc_link *link; int err = -EINVAL; struct tipc_node *node = tipc_node_find(net, addr); if (!node) return err; if (bearer_id >= MAX_BEARERS) goto exit; tipc_node_read_lock(node); link = node->links[bearer_id].link; if (link) { strncpy(linkname, tipc_link_name(link), len); err = 0; } tipc_node_read_unlock(node); exit: tipc_node_put(node); return err; } /* Caller should hold node lock for the passed node */ static int __tipc_nl_add_node(struct tipc_nl_msg *msg, struct tipc_node *node) { void *hdr; struct nlattr *attrs; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_NODE_GET); if (!hdr) return -EMSGSIZE; attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_NODE); if (!attrs) goto msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_NODE_ADDR, node->addr)) goto attr_msg_full; if (node_is_up(node)) if (nla_put_flag(msg->skb, TIPC_NLA_NODE_UP)) goto attr_msg_full; nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); return 0; attr_msg_full: nla_nest_cancel(msg->skb, attrs); msg_full: genlmsg_cancel(msg->skb, hdr); return -EMSGSIZE; } static void tipc_lxc_xmit(struct net *peer_net, struct sk_buff_head *list) { struct tipc_msg *hdr = buf_msg(skb_peek(list)); 
struct sk_buff_head inputq;

	/* Dispatch on the user field of the first buffer in the chain */
	switch (msg_user(hdr)) {
	case TIPC_LOW_IMPORTANCE:
	case TIPC_MEDIUM_IMPORTANCE:
	case TIPC_HIGH_IMPORTANCE:
	case TIPC_CRITICAL_IMPORTANCE:
		/* Connected, named and direct messages go to tipc_sk_rcv() */
		if (msg_connected(hdr) || msg_named(hdr) || msg_direct(hdr)) {
			tipc_loopback_trace(peer_net, list);
			spin_lock_init(&list->lock);
			tipc_sk_rcv(peer_net, list);
			return;
		}
		/* Multicast goes to tipc_sk_mcast_rcv(); both queues are
		 * purged afterwards
		 */
		if (msg_mcast(hdr)) {
			tipc_loopback_trace(peer_net, list);
			skb_queue_head_init(&inputq);
			tipc_sk_mcast_rcv(peer_net, list, &inputq);
			__skb_queue_purge(list);
			skb_queue_purge(&inputq);
			return;
		}
		return;
	case MSG_FRAGMENTER:
		/* Delivered only if tipc_msg_assemble() reports success */
		if (tipc_msg_assemble(list)) {
			tipc_loopback_trace(peer_net, list);
			skb_queue_head_init(&inputq);
			tipc_sk_mcast_rcv(peer_net, list, &inputq);
			__skb_queue_purge(list);
			skb_queue_purge(&inputq);
		}
		return;
	case GROUP_PROTOCOL:
	case CONN_MANAGER:
		tipc_loopback_trace(peer_net, list);
		spin_lock_init(&list->lock);
		tipc_sk_rcv(peer_net, list);
		return;
	/* All remaining users return with the chain untouched */
	case LINK_PROTOCOL:
	case NAME_DISTRIBUTOR:
	case TUNNEL_PROTOCOL:
	case BCAST_PROTOCOL:
		return;
	default:
		return;
	}
}

/**
 * tipc_node_xmit() - general link level function for message sending
 * @net: the applicable net namespace
 * @list: chain of buffers containing message
 * @dnode: address of destination node
 * @selector: a number used for deterministic link selection
 * Consumes the buffer chain.
 * Return: 0 if success, otherwise: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE,-ENOBUFS
 */
int tipc_node_xmit(struct net *net, struct sk_buff_head *list,
		   u32 dnode, int selector)
{
	struct tipc_link_entry *le = NULL;
	struct tipc_node *n;
	struct sk_buff_head xmitq;
	bool node_up = false;
	struct net *peer_net;
	int bearer_id;
	int rc;

	/* Destination is this node: hand the chain straight to the sockets */
	if (in_own_node(net, dnode)) {
		tipc_loopback_trace(net, list);
		spin_lock_init(&list->lock);
		tipc_sk_rcv(net, list);
		return 0;
	}

	n = tipc_node_find(net, dnode);
	if (unlikely(!n)) {
		__skb_queue_purge(list);
		return -EHOSTUNREACH;
	}

	/* Sample node_up and peer_net under the node read lock */
	rcu_read_lock();
	tipc_node_read_lock(n);
	node_up = node_is_up(n);
	peer_net = n->peer_net;
	tipc_node_read_unlock(n);
	if (node_up && peer_net && check_net(peer_net)) {
		/* xmit inner linux container */
		tipc_lxc_xmit(peer_net, list);
		if (likely(skb_queue_empty(list))) {
			rcu_read_unlock();
			tipc_node_put(n);
			return 0;
		}
	}
	rcu_read_unlock();

	/* Bit 0 of the selector picks one of the two active_links slots */
	tipc_node_read_lock(n);
	bearer_id = n->active_links[selector & 1];
	if (unlikely(bearer_id == INVALID_BEARER_ID)) {
		tipc_node_read_unlock(n);
		tipc_node_put(n);
		__skb_queue_purge(list);
		return -EHOSTUNREACH;
	}

	__skb_queue_head_init(&xmitq);
	le = &n->links[bearer_id];
	spin_lock_bh(&le->lock);
	rc = tipc_link_xmit(le->link, list, &xmitq);
	spin_unlock_bh(&le->lock);
	tipc_node_read_unlock(n);

	/* -ENOBUFS takes the link down instead of sending xmitq */
	if (unlikely(rc == -ENOBUFS))
		tipc_node_link_down(n, bearer_id, false);
	else
		tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr, n);

	tipc_node_put(n);

	return rc;
}

/* tipc_node_xmit_skb(): send single buffer to destination
 * Buffers sent via this function are generally TIPC_SYSTEM_IMPORTANCE
 * messages, which will not be rejected
 * The only exception is datagram messages rerouted after secondary
 * lookup, which are rare and safe to dispose of anyway.
*/ int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode, u32 selector) { struct sk_buff_head head; __skb_queue_head_init(&head); __skb_queue_tail(&head, skb); tipc_node_xmit(net, &head, dnode, selector); return 0; } /* tipc_node_distr_xmit(): send single buffer msgs to individual destinations * Note: this is only for SYSTEM_IMPORTANCE messages, which cannot be rejected */ int tipc_node_distr_xmit(struct net *net, struct sk_buff_head *xmitq) { struct sk_buff *skb; u32 selector, dnode; while ((skb = __skb_dequeue(xmitq))) { selector = msg_origport(buf_msg(skb)); dnode = msg_destnode(buf_msg(skb)); tipc_node_xmit_skb(net, skb, dnode, selector); } return 0; } void tipc_node_broadcast(struct net *net, struct sk_buff *skb, int rc_dests) { struct sk_buff_head xmitq; struct sk_buff *txskb; struct tipc_node *n; u16 dummy; u32 dst; /* Use broadcast if all nodes support it */ if (!rc_dests && tipc_bcast_get_mode(net) != BCLINK_MODE_RCAST) { __skb_queue_head_init(&xmitq); __skb_queue_tail(&xmitq, skb); tipc_bcast_xmit(net, &xmitq, &dummy); return; } /* Otherwise use legacy replicast method */ rcu_read_lock(); list_for_each_entry_rcu(n, tipc_nodes(net), list) { dst = n->addr; if (in_own_node(net, dst)) continue; if (!node_is_up(n)) continue; txskb = pskb_copy(skb, GFP_ATOMIC); if (!txskb) break; msg_set_destnode(buf_msg(txskb), dst); tipc_node_xmit_skb(net, txskb, dst, 0); } rcu_read_unlock(); kfree_skb(skb); } static void tipc_node_mcast_rcv(struct tipc_node *n) { struct tipc_bclink_entry *be = &n->bc_entry; /* 'arrvq' is under inputq2's lock protection */ spin_lock_bh(&be->inputq2.lock); spin_lock_bh(&be->inputq1.lock); skb_queue_splice_tail_init(&be->inputq1, &be->arrvq); spin_unlock_bh(&be->inputq1.lock); spin_unlock_bh(&be->inputq2.lock); tipc_sk_mcast_rcv(n->net, &be->arrvq, &be->inputq2); } static void tipc_node_bc_sync_rcv(struct tipc_node *n, struct tipc_msg *hdr, int bearer_id, struct sk_buff_head *xmitq) { struct tipc_link *ucl; int rc; rc = 
tipc_bcast_sync_rcv(n->net, n->bc_entry.link, hdr, xmitq);

	/* A link-down event resets every link to the peer; nothing else to do */
	if (rc & TIPC_LINK_DOWN_EVT) {
		tipc_node_reset_links(n);
		return;
	}

	if (!(rc & TIPC_LINK_SND_STATE))
		return;

	/* If probe message, a STATE response will be sent anyway */
	if (msg_probe(hdr))
		return;

	/* Produce a STATE message carrying broadcast NACK */
	tipc_node_read_lock(n);
	ucl = n->links[bearer_id].link;
	if (ucl)
		tipc_link_build_state_msg(ucl, xmitq);
	tipc_node_read_unlock(n);
}

/**
 * tipc_node_bc_rcv - process TIPC broadcast packet arriving from off-node
 * @net: the applicable net namespace
 * @skb: TIPC packet
 * @bearer_id: id of bearer message arrived on
 *
 * Invoked with no locks held.
 */
static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id)
{
	int rc;
	struct sk_buff_head xmitq;
	struct tipc_bclink_entry *be;
	struct tipc_link_entry *le;
	struct tipc_msg *hdr = buf_msg(skb);
	int usr = msg_user(hdr);
	u32 dnode = msg_destnode(hdr);
	struct tipc_node *n;

	__skb_queue_head_init(&xmitq);

	/* If NACK for other node, let rcv link for that node peek into it */
	if ((usr == BCAST_PROTOCOL) && (dnode != tipc_own_addr(net)))
		n = tipc_node_find(net, dnode);
	else
		n = tipc_node_find(net, msg_prevnode(hdr));
	if (!n) {
		kfree_skb(skb);
		return;
	}
	be = &n->bc_entry;
	le = &n->links[bearer_id];

	rc = tipc_bcast_rcv(net, be->link, skb);

	/* Broadcast ACKs are sent on a unicast link */
	if (rc & TIPC_LINK_SND_STATE) {
		tipc_node_read_lock(n);
		tipc_link_build_state_msg(le->link, &xmitq);
		tipc_node_read_unlock(n);
	}

	if (!skb_queue_empty(&xmitq))
		tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr, n);

	if (!skb_queue_empty(&be->inputq1))
		tipc_node_mcast_rcv(n);

	/* Handle NAME_DISTRIBUTOR messages sent from 1.7 nodes */
	if (!skb_queue_empty(&n->bc_entry.namedq))
		tipc_named_rcv(net, &n->bc_entry.namedq,
			       &n->bc_entry.named_rcv_nxt,
			       &n->bc_entry.named_open);

	/* If reassembly or retransmission failure => reset all links to peer */
	if (rc & TIPC_LINK_DOWN_EVT)
		tipc_node_reset_links(n);

	/* Drop the reference obtained by tipc_node_find() */
	tipc_node_put(n);
}

/**
 * tipc_node_check_state - check and if necessary update node state
 * @n: target tipc_node
 * @skb: TIPC packet
 * @bearer_id: identity of bearer delivering the packet
 * @xmitq: queue for messages to be xmited on
 * Return: true if state and msg are ok, otherwise false
 */
static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb,
				  int bearer_id, struct sk_buff_head *xmitq)
{
	struct tipc_msg *hdr = buf_msg(skb);
	int usr = msg_user(hdr);
	int mtyp = msg_type(hdr);
	u16 oseqno = msg_seqno(hdr);
	u16 exp_pkts = msg_msgcnt(hdr);
	u16 rcv_nxt, syncpt, dlv_nxt, inputq_len;
	int state = n->state;
	struct tipc_link *l, *tnl, *pl = NULL;
	struct tipc_media_addr *maddr;
	int pb_id;

	if (trace_tipc_node_check_state_enabled()) {
		trace_tipc_skb_dump(skb, false, "skb for node state check");
		trace_tipc_node_check_state(n, true, " ");
	}
	/* No link on the delivering bearer: the packet cannot be accepted */
	l = n->links[bearer_id].link;
	if (!l)
		return false;
	rcv_nxt = tipc_link_rcv_nxt(l);


	/* Common case: fully established node and no tunnel packet */
	if (likely((state == SELF_UP_PEER_UP) && (usr != TUNNEL_PROTOCOL)))
		return true;

	/* Find parallel link, if any */
	for (pb_id = 0; pb_id < MAX_BEARERS; pb_id++) {
		if ((pb_id != bearer_id) && n->links[pb_id].link) {
			pl = n->links[pb_id].link;
			break;
		}
	}

	if (!tipc_link_validate_msg(l, hdr)) {
		trace_tipc_skb_dump(skb, false, "PROTO invalid (2)!");
		trace_tipc_link_dump(l, TIPC_DUMP_NONE, "PROTO invalid (2)!");
		return false;
	}

	/* Check and update node accessibility if applicable.
	 * Note that 'state' is the value sampled on entry; later checks that
	 * need the updated value read n->state directly.
	 */
	if (state == SELF_UP_PEER_COMING) {
		if (!tipc_link_is_up(l))
			return true;
		if (!msg_peer_link_is_up(hdr))
			return true;
		tipc_node_fsm_evt(n, PEER_ESTABL_CONTACT_EVT);
	}

	if (state == SELF_DOWN_PEER_LEAVING) {
		if (msg_peer_node_is_up(hdr))
			return false;
		tipc_node_fsm_evt(n, PEER_LOST_CONTACT_EVT);
		return true;
	}

	if (state == SELF_LEAVING_PEER_DOWN)
		return false;

	/* Ignore duplicate packets */
	if ((usr != LINK_PROTOCOL) && less(oseqno, rcv_nxt))
		return true;

	/* Initiate or update failover mode if applicable */
	if ((usr == TUNNEL_PROTOCOL) && (mtyp == FAILOVER_MSG)) {
		syncpt = oseqno + exp_pkts - 1;
		if (pl && !tipc_link_is_reset(pl)) {
			__tipc_node_link_down(n, &pb_id, xmitq, &maddr);
			trace_tipc_node_link_down(n, true,
						  "node link down <- failover!");
			tipc_skb_queue_splice_tail_init(tipc_link_inputq(pl),
							tipc_link_inputq(l));
		}

		/* If parallel link was already down, and this happened before
		 * the tunnel link came up, node failover was never started.
		 * Ensure that a FAILOVER_MSG is sent to get peer out of
		 * NODE_FAILINGOVER state, also this node must accept
		 * TUNNEL_MSGs from peer.
		 */
		if (n->state != NODE_FAILINGOVER)
			tipc_node_link_failover(n, pl, l, xmitq);

		/* If pkts arrive out of order, use lowest calculated syncpt */
		if (less(syncpt, n->sync_point))
			n->sync_point = syncpt;
	}

	/* Open parallel link when tunnel link reaches synch point */
	if ((n->state == NODE_FAILINGOVER) && tipc_link_is_up(l)) {
		if (!more(rcv_nxt, n->sync_point))
			return true;
		tipc_node_fsm_evt(n, NODE_FAILOVER_END_EVT);
		if (pl)
			tipc_link_fsm_evt(pl, LINK_FAILOVER_END_EVT);
		return true;
	}

	/* No syncing needed if only one link */
	if (!pl || !tipc_link_is_up(pl))
		return true;

	/* Initiate synch mode if applicable */
	if ((usr == TUNNEL_PROTOCOL) && (mtyp == SYNCH_MSG) && (oseqno == 1)) {
		/* Peers with TIPC_TUNNEL_ENHANCED carry the synch point in
		 * the header; otherwise it is derived from the inner header
		 */
		if (n->capabilities & TIPC_TUNNEL_ENHANCED)
			syncpt = msg_syncpt(hdr);
		else
			syncpt = msg_seqno(msg_inner_hdr(hdr)) + exp_pkts - 1;
		if (!tipc_link_is_up(l))
			__tipc_node_link_up(n, bearer_id, xmitq);
		if (n->state == SELF_UP_PEER_UP) {
			n->sync_point = syncpt;
			tipc_link_fsm_evt(l, LINK_SYNCH_BEGIN_EVT);
			tipc_node_fsm_evt(n, NODE_SYNCH_BEGIN_EVT);
		}
	}

	/* Open tunnel link when parallel link reaches synch point */
	if (n->state == NODE_SYNCHING) {
		/* After this, 'tnl' is the synching link and 'pl' the other */
		if (tipc_link_is_synching(l)) {
			tnl = l;
		} else {
			tnl = pl;
			pl = l;
		}
		/* Next sequence number to deliver: packets still sitting in
		 * pl's input queue are subtracted from its rcv_nxt
		 */
		inputq_len = skb_queue_len(tipc_link_inputq(pl));
		dlv_nxt = tipc_link_rcv_nxt(pl) - inputq_len;
		if (more(dlv_nxt, n->sync_point)) {
			tipc_link_fsm_evt(tnl, LINK_SYNCH_END_EVT);
			tipc_node_fsm_evt(n, NODE_SYNCH_END_EVT);
			return true;
		}
		if (l == pl)
			return true;
		if ((usr == TUNNEL_PROTOCOL) && (mtyp == SYNCH_MSG))
			return true;
		if (usr == LINK_PROTOCOL)
			return true;
		return false;
	}
	return true;
}

/**
 * tipc_rcv - process TIPC packets/messages arriving from off-node
 * @net: the applicable net namespace
 * @skb: TIPC packet
 * @b: pointer to bearer message arrived on
 *
 * Invoked with no locks held. Bearer pointer must point to a valid bearer
 * structure (i.e. cannot be NULL), but bearer can be inactive.
 */
void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b)
{
	struct sk_buff_head xmitq;
	struct tipc_link_entry *le;
	struct tipc_msg *hdr;
	struct tipc_node *n;
	int bearer_id = b->identity;
	u32 self = tipc_own_addr(net);
	int usr, rc = 0;
	u16 bc_ack;
#ifdef CONFIG_TIPC_CRYPTO
	struct tipc_ehdr *ehdr;

	/* Check if message must be decrypted first */
	if (TIPC_SKB_CB(skb)->decrypted || !tipc_ehdr_validate(skb))
		goto rcv;

	/* Look the sender up by address, or by node id for LINK_CONFIG */
	ehdr = (struct tipc_ehdr *)skb->data;
	if (likely(ehdr->user != LINK_CONFIG)) {
		n = tipc_node_find(net, ntohl(ehdr->addr));
		if (unlikely(!n))
			goto discard;
	} else {
		n = tipc_node_find_by_id(net, ehdr->id);
	}
	tipc_crypto_rcv(net, (n) ? n->crypto_rx : NULL, &skb, b);
	/* tipc_crypto_rcv() may leave skb NULL; nothing more to do then */
	if (!skb)
		return;

rcv:
#endif
	/* Ensure message is well-formed before touching the header */
	if (unlikely(!tipc_msg_validate(&skb)))
		goto discard;
	__skb_queue_head_init(&xmitq);
	hdr = buf_msg(skb);
	usr = msg_user(hdr);
	bc_ack = msg_bcast_ack(hdr);

	/* Handle arrival of discovery or broadcast packet */
	if (unlikely(msg_non_seq(hdr))) {
		if (unlikely(usr == LINK_CONFIG))
			return tipc_disc_rcv(net, skb, b);
		else
			return tipc_node_bc_rcv(net, skb, bearer_id);
	}

	/* Discard unicast link messages destined for another node */
	if (unlikely(!msg_short(hdr) && (msg_destnode(hdr) != self)))
		goto discard;

	/* Locate neighboring node that sent packet */
	n = tipc_node_find(net, msg_prevnode(hdr));
	if (unlikely(!n))
		goto discard;
	le = &n->links[bearer_id];

	/* Ensure broadcast reception is in synch with peer's send state */
	if (unlikely(usr == LINK_PROTOCOL)) {
		if (unlikely(skb_linearize(skb))) {
			tipc_node_put(n);
			goto discard;
		}
		/* Re-read the header pointer after linearizing */
		hdr = buf_msg(skb);
		tipc_node_bc_sync_rcv(n, hdr, bearer_id, &xmitq);
	} else if (unlikely(tipc_link_acked(n->bc_entry.link) != bc_ack)) {
		tipc_bcast_ack_rcv(net, n->bc_entry.link, hdr);
	}

	/* Receive packet directly if conditions permit */
	tipc_node_read_lock(n);
	if (likely((n->state == SELF_UP_PEER_UP) && (usr != TUNNEL_PROTOCOL))) {
		spin_lock_bh(&le->lock);
		if (le->link) {
			rc = tipc_link_rcv(le->link, skb, &xmitq);
			/* skb was handed to the link; do not free it below */
			skb = NULL;
		}
		spin_unlock_bh(&le->lock);
	}
	tipc_node_read_unlock(n);

	/* Check/update node state before receiving */
	if (unlikely(skb)) {
		if (unlikely(skb_linearize(skb)))
			goto out_node_put;
		tipc_node_write_lock(n);
		if (tipc_node_check_state(n, skb, bearer_id, &xmitq)) {
			if (le->link) {
				rc = tipc_link_rcv(le->link, skb, &xmitq);
				skb = NULL;
			}
		}
		tipc_node_write_unlock(n);
	}

	if (unlikely(rc & TIPC_LINK_UP_EVT))
		tipc_node_link_up(n, bearer_id, &xmitq);

	if (unlikely(rc & TIPC_LINK_DOWN_EVT))
		tipc_node_link_down(n, bearer_id, false);

	if (unlikely(!skb_queue_empty(&n->bc_entry.namedq)))
		tipc_named_rcv(net,
&n->bc_entry.namedq, &n->bc_entry.named_rcv_nxt, &n->bc_entry.named_open); if (unlikely(!skb_queue_empty(&n->bc_entry.inputq1))) tipc_node_mcast_rcv(n); if (!skb_queue_empty(&le->inputq)) tipc_sk_rcv(net, &le->inputq); if (!skb_queue_empty(&xmitq)) tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr, n); out_node_put: tipc_node_put(n); discard: kfree_skb(skb); } void tipc_node_apply_property(struct net *net, struct tipc_bearer *b, int prop) { struct tipc_net *tn = tipc_net(net); int bearer_id = b->identity; struct sk_buff_head xmitq; struct tipc_link_entry *e; struct tipc_node *n; __skb_queue_head_init(&xmitq); rcu_read_lock(); list_for_each_entry_rcu(n, &tn->node_list, list) { tipc_node_write_lock(n); e = &n->links[bearer_id]; if (e->link) { if (prop == TIPC_NLA_PROP_TOL) tipc_link_set_tolerance(e->link, b->tolerance, &xmitq); else if (prop == TIPC_NLA_PROP_MTU) tipc_link_set_mtu(e->link, b->mtu); /* Update MTU for node link entry */ e->mtu = tipc_link_mss(e->link); } tipc_node_write_unlock(n); tipc_bearer_xmit(net, bearer_id, &xmitq, &e->maddr, NULL); } rcu_read_unlock(); } int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); struct tipc_net *tn = net_generic(net, tipc_net_id); struct nlattr *attrs[TIPC_NLA_NET_MAX + 1]; struct tipc_node *peer, *temp_node; u8 node_id[NODE_ID_LEN]; u64 *w0 = (u64 *)&node_id[0]; u64 *w1 = (u64 *)&node_id[8]; u32 addr; int err; /* We identify the peer by its net */ if (!info->attrs[TIPC_NLA_NET]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_NET_MAX, info->attrs[TIPC_NLA_NET], tipc_nl_net_policy, info->extack); if (err) return err; /* attrs[TIPC_NLA_NET_NODEID] and attrs[TIPC_NLA_NET_ADDR] are * mutually exclusive cases */ if (attrs[TIPC_NLA_NET_ADDR]) { addr = nla_get_u32(attrs[TIPC_NLA_NET_ADDR]); if (!addr) return -EINVAL; } if (attrs[TIPC_NLA_NET_NODEID]) { if (!attrs[TIPC_NLA_NET_NODEID_W1]) return -EINVAL; *w0 = nla_get_u64(attrs[TIPC_NLA_NET_NODEID]); *w1 
= nla_get_u64(attrs[TIPC_NLA_NET_NODEID_W1]); addr = hash128to32(node_id); } if (in_own_node(net, addr)) return -ENOTSUPP; spin_lock_bh(&tn->node_list_lock); peer = tipc_node_find(net, addr); if (!peer) { spin_unlock_bh(&tn->node_list_lock); return -ENXIO; } tipc_node_write_lock(peer); if (peer->state != SELF_DOWN_PEER_DOWN && peer->state != SELF_DOWN_PEER_LEAVING) { tipc_node_write_unlock(peer); err = -EBUSY; goto err_out; } tipc_node_clear_links(peer); tipc_node_write_unlock(peer); tipc_node_delete(peer); /* Calculate cluster capabilities */ tn->capabilities = TIPC_NODE_CAPABILITIES; list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; } tipc_bcast_toggle_rcast(net, (tn->capabilities & TIPC_BCAST_RCAST)); err = 0; err_out: tipc_node_put(peer); spin_unlock_bh(&tn->node_list_lock); return err; } int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb) { int err; struct net *net = sock_net(skb->sk); struct tipc_net *tn = net_generic(net, tipc_net_id); int done = cb->args[0]; int last_addr = cb->args[1]; struct tipc_node *node; struct tipc_nl_msg msg; if (done) return 0; msg.skb = skb; msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; rcu_read_lock(); if (last_addr) { node = tipc_node_find(net, last_addr); if (!node) { rcu_read_unlock(); /* We never set seq or call nl_dump_check_consistent() * this means that setting prev_seq here will cause the * consistence check to fail in the netlink callback * handler. Resulting in the NLMSG_DONE message having * the NLM_F_DUMP_INTR flag set if the node state * changed while we released the lock. 
*/ cb->prev_seq = 1; return -EPIPE; } tipc_node_put(node); } list_for_each_entry_rcu(node, &tn->node_list, list) { if (node->preliminary) continue; if (last_addr) { if (node->addr == last_addr) last_addr = 0; else continue; } tipc_node_read_lock(node); err = __tipc_nl_add_node(&msg, node); if (err) { last_addr = node->addr; tipc_node_read_unlock(node); goto out; } tipc_node_read_unlock(node); } done = 1; out: cb->args[0] = done; cb->args[1] = last_addr; rcu_read_unlock(); return skb->len; } /* tipc_node_find_by_name - locate owner node of link by link's name * @net: the applicable net namespace * @name: pointer to link name string * @bearer_id: pointer to index in 'node->links' array where the link was found. * * Returns pointer to node owning the link, or 0 if no matching link is found. */ static struct tipc_node *tipc_node_find_by_name(struct net *net, const char *link_name, unsigned int *bearer_id) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_link *l; struct tipc_node *n; struct tipc_node *found_node = NULL; int i; *bearer_id = 0; rcu_read_lock(); list_for_each_entry_rcu(n, &tn->node_list, list) { tipc_node_read_lock(n); for (i = 0; i < MAX_BEARERS; i++) { l = n->links[i].link; if (l && !strcmp(tipc_link_name(l), link_name)) { *bearer_id = i; found_node = n; break; } } tipc_node_read_unlock(n); if (found_node) break; } rcu_read_unlock(); return found_node; } int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info) { int err; int res = 0; int bearer_id; char *name; struct tipc_link *link; struct tipc_node *node; struct sk_buff_head xmitq; struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1]; struct net *net = sock_net(skb->sk); __skb_queue_head_init(&xmitq); if (!info->attrs[TIPC_NLA_LINK]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_LINK_MAX, info->attrs[TIPC_NLA_LINK], tipc_nl_link_policy, info->extack); if (err) return err; if (!attrs[TIPC_NLA_LINK_NAME]) return -EINVAL; name = 
nla_data(attrs[TIPC_NLA_LINK_NAME]); if (strcmp(name, tipc_bclink_name) == 0) return tipc_nl_bc_link_set(net, attrs); node = tipc_node_find_by_name(net, name, &bearer_id); if (!node) return -EINVAL; tipc_node_read_lock(node); link = node->links[bearer_id].link; if (!link) { res = -EINVAL; goto out; } if (attrs[TIPC_NLA_LINK_PROP]) { struct nlattr *props[TIPC_NLA_PROP_MAX + 1]; err = tipc_nl_parse_link_prop(attrs[TIPC_NLA_LINK_PROP], props); if (err) { res = err; goto out; } if (props[TIPC_NLA_PROP_TOL]) { u32 tol; tol = nla_get_u32(props[TIPC_NLA_PROP_TOL]); tipc_link_set_tolerance(link, tol, &xmitq); } if (props[TIPC_NLA_PROP_PRIO]) { u32 prio; prio = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); tipc_link_set_prio(link, prio, &xmitq); } if (props[TIPC_NLA_PROP_WIN]) { u32 max_win; max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); tipc_link_set_queue_limits(link, tipc_link_min_win(link), max_win); } } out: tipc_node_read_unlock(node); tipc_bearer_xmit(net, bearer_id, &xmitq, &node->links[bearer_id].maddr, NULL); return res; } int tipc_nl_node_get_link(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1]; struct tipc_nl_msg msg; char *name; int err; msg.portid = info->snd_portid; msg.seq = info->snd_seq; if (!info->attrs[TIPC_NLA_LINK]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_LINK_MAX, info->attrs[TIPC_NLA_LINK], tipc_nl_link_policy, info->extack); if (err) return err; if (!attrs[TIPC_NLA_LINK_NAME]) return -EINVAL; name = nla_data(attrs[TIPC_NLA_LINK_NAME]); msg.skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!msg.skb) return -ENOMEM; if (strcmp(name, tipc_bclink_name) == 0) { err = tipc_nl_add_bc_link(net, &msg, tipc_net(net)->bcl); if (err) goto err_free; } else { int bearer_id; struct tipc_node *node; struct tipc_link *link; node = tipc_node_find_by_name(net, name, &bearer_id); if (!node) { err = -EINVAL; goto err_free; } tipc_node_read_lock(node); link = 
node->links[bearer_id].link; if (!link) { tipc_node_read_unlock(node); err = -EINVAL; goto err_free; } err = __tipc_nl_add_link(net, &msg, link, 0); tipc_node_read_unlock(node); if (err) goto err_free; } return genlmsg_reply(msg.skb, info); err_free: nlmsg_free(msg.skb); return err; } int tipc_nl_node_reset_link_stats(struct sk_buff *skb, struct genl_info *info) { int err; char *link_name; unsigned int bearer_id; struct tipc_link *link; struct tipc_node *node; struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1]; struct net *net = sock_net(skb->sk); struct tipc_net *tn = tipc_net(net); struct tipc_link_entry *le; if (!info->attrs[TIPC_NLA_LINK]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_LINK_MAX, info->attrs[TIPC_NLA_LINK], tipc_nl_link_policy, info->extack); if (err) return err; if (!attrs[TIPC_NLA_LINK_NAME]) return -EINVAL; link_name = nla_data(attrs[TIPC_NLA_LINK_NAME]); err = -EINVAL; if (!strcmp(link_name, tipc_bclink_name)) { err = tipc_bclink_reset_stats(net, tipc_bc_sndlink(net)); if (err) return err; return 0; } else if (strstr(link_name, tipc_bclink_name)) { rcu_read_lock(); list_for_each_entry_rcu(node, &tn->node_list, list) { tipc_node_read_lock(node); link = node->bc_entry.link; if (link && !strcmp(link_name, tipc_link_name(link))) { err = tipc_bclink_reset_stats(net, link); tipc_node_read_unlock(node); break; } tipc_node_read_unlock(node); } rcu_read_unlock(); return err; } node = tipc_node_find_by_name(net, link_name, &bearer_id); if (!node) return -EINVAL; le = &node->links[bearer_id]; tipc_node_read_lock(node); spin_lock_bh(&le->lock); link = node->links[bearer_id].link; if (!link) { spin_unlock_bh(&le->lock); tipc_node_read_unlock(node); return -EINVAL; } tipc_link_reset_stats(link); spin_unlock_bh(&le->lock); tipc_node_read_unlock(node); return 0; } /* Caller should hold node lock */ static int __tipc_nl_add_node_links(struct net *net, struct tipc_nl_msg *msg, struct tipc_node *node, u32 *prev_link, bool bc_link) { u32 i; int err; 
for (i = *prev_link; i < MAX_BEARERS; i++) { *prev_link = i; if (!node->links[i].link) continue; err = __tipc_nl_add_link(net, msg, node->links[i].link, NLM_F_MULTI); if (err) return err; } if (bc_link) { *prev_link = i; err = tipc_nl_add_bc_link(net, msg, node->bc_entry.link); if (err) return err; } *prev_link = 0; return 0; } int tipc_nl_node_dump_link(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct nlattr **attrs = genl_dumpit_info(cb)->info.attrs; struct nlattr *link[TIPC_NLA_LINK_MAX + 1]; struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_node *node; struct tipc_nl_msg msg; u32 prev_node = cb->args[0]; u32 prev_link = cb->args[1]; int done = cb->args[2]; bool bc_link = cb->args[3]; int err; if (done) return 0; if (!prev_node) { /* Check if broadcast-receiver links dumping is needed */ if (attrs && attrs[TIPC_NLA_LINK]) { err = nla_parse_nested_deprecated(link, TIPC_NLA_LINK_MAX, attrs[TIPC_NLA_LINK], tipc_nl_link_policy, NULL); if (unlikely(err)) return err; if (unlikely(!link[TIPC_NLA_LINK_BROADCAST])) return -EINVAL; bc_link = true; } } msg.skb = skb; msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; rcu_read_lock(); if (prev_node) { node = tipc_node_find(net, prev_node); if (!node) { /* We never set seq or call nl_dump_check_consistent() * this means that setting prev_seq here will cause the * consistence check to fail in the netlink callback * handler. Resulting in the last NLMSG_DONE message * having the NLM_F_DUMP_INTR flag set. 
*/ cb->prev_seq = 1; goto out; } tipc_node_put(node); list_for_each_entry_continue_rcu(node, &tn->node_list, list) { tipc_node_read_lock(node); err = __tipc_nl_add_node_links(net, &msg, node, &prev_link, bc_link); tipc_node_read_unlock(node); if (err) goto out; prev_node = node->addr; } } else { err = tipc_nl_add_bc_link(net, &msg, tn->bcl); if (err) goto out; list_for_each_entry_rcu(node, &tn->node_list, list) { tipc_node_read_lock(node); err = __tipc_nl_add_node_links(net, &msg, node, &prev_link, bc_link); tipc_node_read_unlock(node); if (err) goto out; prev_node = node->addr; } } done = 1; out: rcu_read_unlock(); cb->args[0] = prev_node; cb->args[1] = prev_link; cb->args[2] = done; cb->args[3] = bc_link; return skb->len; } int tipc_nl_node_set_monitor(struct sk_buff *skb, struct genl_info *info) { struct nlattr *attrs[TIPC_NLA_MON_MAX + 1]; struct net *net = sock_net(skb->sk); int err; if (!info->attrs[TIPC_NLA_MON]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_MON_MAX, info->attrs[TIPC_NLA_MON], tipc_nl_monitor_policy, info->extack); if (err) return err; if (attrs[TIPC_NLA_MON_ACTIVATION_THRESHOLD]) { u32 val; val = nla_get_u32(attrs[TIPC_NLA_MON_ACTIVATION_THRESHOLD]); err = tipc_nl_monitor_set_threshold(net, val); if (err) return err; } return 0; } static int __tipc_nl_add_monitor_prop(struct net *net, struct tipc_nl_msg *msg) { struct nlattr *attrs; void *hdr; u32 val; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, 0, TIPC_NL_MON_GET); if (!hdr) return -EMSGSIZE; attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_MON); if (!attrs) goto msg_full; val = tipc_nl_monitor_get_threshold(net); if (nla_put_u32(msg->skb, TIPC_NLA_MON_ACTIVATION_THRESHOLD, val)) goto attr_msg_full; nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); return 0; attr_msg_full: nla_nest_cancel(msg->skb, attrs); msg_full: genlmsg_cancel(msg->skb, hdr); return -EMSGSIZE; } int tipc_nl_node_get_monitor(struct sk_buff *skb, struct genl_info 
*info) { struct net *net = sock_net(skb->sk); struct tipc_nl_msg msg; int err; msg.skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!msg.skb) return -ENOMEM; msg.portid = info->snd_portid; msg.seq = info->snd_seq; err = __tipc_nl_add_monitor_prop(net, &msg); if (err) { nlmsg_free(msg.skb); return err; } return genlmsg_reply(msg.skb, info); } int tipc_nl_node_dump_monitor(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); u32 prev_bearer = cb->args[0]; struct tipc_nl_msg msg; int bearer_id; int err; if (prev_bearer == MAX_BEARERS) return 0; msg.skb = skb; msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; rtnl_lock(); for (bearer_id = prev_bearer; bearer_id < MAX_BEARERS; bearer_id++) { err = __tipc_nl_add_monitor(net, &msg, bearer_id); if (err) break; } rtnl_unlock(); cb->args[0] = bearer_id; return skb->len; } int tipc_nl_node_dump_monitor_peer(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); u32 prev_node = cb->args[1]; u32 bearer_id = cb->args[2]; int done = cb->args[0]; struct tipc_nl_msg msg; int err; if (!prev_node) { struct nlattr **attrs = genl_dumpit_info(cb)->info.attrs; struct nlattr *mon[TIPC_NLA_MON_MAX + 1]; if (!attrs[TIPC_NLA_MON]) return -EINVAL; err = nla_parse_nested_deprecated(mon, TIPC_NLA_MON_MAX, attrs[TIPC_NLA_MON], tipc_nl_monitor_policy, NULL); if (err) return err; if (!mon[TIPC_NLA_MON_REF]) return -EINVAL; bearer_id = nla_get_u32(mon[TIPC_NLA_MON_REF]); if (bearer_id >= MAX_BEARERS) return -EINVAL; } if (done) return 0; msg.skb = skb; msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; rtnl_lock(); err = tipc_nl_add_monitor_peer(net, &msg, bearer_id, &prev_node); if (!err) done = 1; rtnl_unlock(); cb->args[0] = done; cb->args[1] = prev_node; cb->args[2] = bearer_id; return skb->len; } #ifdef CONFIG_TIPC_CRYPTO static int tipc_nl_retrieve_key(struct nlattr **attrs, struct tipc_aead_key **pkey) { struct nlattr *attr = 
attrs[TIPC_NLA_NODE_KEY]; struct tipc_aead_key *key; if (!attr) return -ENODATA; if (nla_len(attr) < sizeof(*key)) return -EINVAL; key = (struct tipc_aead_key *)nla_data(attr); if (key->keylen > TIPC_AEAD_KEYLEN_MAX || nla_len(attr) < tipc_aead_key_size(key)) return -EINVAL; *pkey = key; return 0; } static int tipc_nl_retrieve_nodeid(struct nlattr **attrs, u8 **node_id) { struct nlattr *attr = attrs[TIPC_NLA_NODE_ID]; if (!attr) return -ENODATA; if (nla_len(attr) < TIPC_NODEID_LEN) return -EINVAL; *node_id = (u8 *)nla_data(attr); return 0; } static int tipc_nl_retrieve_rekeying(struct nlattr **attrs, u32 *intv) { struct nlattr *attr = attrs[TIPC_NLA_NODE_REKEYING]; if (!attr) return -ENODATA; *intv = nla_get_u32(attr); return 0; } static int __tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) { struct nlattr *attrs[TIPC_NLA_NODE_MAX + 1]; struct net *net = sock_net(skb->sk); struct tipc_crypto *tx = tipc_net(net)->crypto_tx, *c = tx; struct tipc_node *n = NULL; struct tipc_aead_key *ukey; bool rekeying = true, master_key = false; u8 *id, *own_id, mode; u32 intv = 0; int rc = 0; if (!info->attrs[TIPC_NLA_NODE]) return -EINVAL; rc = nla_parse_nested(attrs, TIPC_NLA_NODE_MAX, info->attrs[TIPC_NLA_NODE], tipc_nl_node_policy, info->extack); if (rc) return rc; own_id = tipc_own_id(net); if (!own_id) { GENL_SET_ERR_MSG(info, "not found own node identity (set id?)"); return -EPERM; } rc = tipc_nl_retrieve_rekeying(attrs, &intv); if (rc == -ENODATA) rekeying = false; rc = tipc_nl_retrieve_key(attrs, &ukey); if (rc == -ENODATA && rekeying) goto rekeying; else if (rc) return rc; rc = tipc_aead_key_validate(ukey, info); if (rc) return rc; rc = tipc_nl_retrieve_nodeid(attrs, &id); switch (rc) { case -ENODATA: mode = CLUSTER_KEY; master_key = !!(attrs[TIPC_NLA_NODE_KEY_MASTER]); break; case 0: mode = PER_NODE_KEY; if (memcmp(id, own_id, NODE_ID_LEN)) { n = tipc_node_find_by_id(net, id) ?: tipc_node_create(net, 0, id, 0xffffu, 0, true); if (unlikely(!n)) return 
-ENOMEM; c = n->crypto_rx; } break; default: return rc; } /* Initiate the TX/RX key */ rc = tipc_crypto_key_init(c, ukey, mode, master_key); if (n) tipc_node_put(n); if (unlikely(rc < 0)) { GENL_SET_ERR_MSG(info, "unable to initiate or attach new key"); return rc; } else if (c == tx) { /* Distribute TX key but not master one */ if (!master_key && tipc_crypto_key_distr(tx, rc, NULL)) GENL_SET_ERR_MSG(info, "failed to replicate new key"); rekeying: /* Schedule TX rekeying if needed */ tipc_crypto_rekeying_sched(tx, rekeying, intv); } return 0; } int tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) { int err; rtnl_lock(); err = __tipc_nl_node_set_key(skb, info); rtnl_unlock(); return err; } static int __tipc_nl_node_flush_key(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); struct tipc_net *tn = tipc_net(net); struct tipc_node *n; tipc_crypto_key_flush(tn->crypto_tx); rcu_read_lock(); list_for_each_entry_rcu(n, &tn->node_list, list) tipc_crypto_key_flush(n->crypto_rx); rcu_read_unlock(); return 0; } int tipc_nl_node_flush_key(struct sk_buff *skb, struct genl_info *info) { int err; rtnl_lock(); err = __tipc_nl_node_flush_key(skb, info); rtnl_unlock(); return err; } #endif /** * tipc_node_dump - dump TIPC node data * @n: tipc node to be dumped * @more: dump more? * - false: dump only tipc node data * - true: dump node link data as well * @buf: returned buffer of dump data in format */ int tipc_node_dump(struct tipc_node *n, bool more, char *buf) { int i = 0; size_t sz = (more) ? 
NODE_LMAX : NODE_LMIN; if (!n) { i += scnprintf(buf, sz, "node data: (null)\n"); return i; } i += scnprintf(buf, sz, "node data: %x", n->addr); i += scnprintf(buf + i, sz - i, " %x", n->state); i += scnprintf(buf + i, sz - i, " %d", n->active_links[0]); i += scnprintf(buf + i, sz - i, " %d", n->active_links[1]); i += scnprintf(buf + i, sz - i, " %x", n->action_flags); i += scnprintf(buf + i, sz - i, " %u", n->failover_sent); i += scnprintf(buf + i, sz - i, " %u", n->sync_point); i += scnprintf(buf + i, sz - i, " %d", n->link_cnt); i += scnprintf(buf + i, sz - i, " %u", n->working_links); i += scnprintf(buf + i, sz - i, " %x", n->capabilities); i += scnprintf(buf + i, sz - i, " %lu\n", n->keepalive_intv); if (!more) return i; i += scnprintf(buf + i, sz - i, "link_entry[0]:\n"); i += scnprintf(buf + i, sz - i, " mtu: %u\n", n->links[0].mtu); i += scnprintf(buf + i, sz - i, " media: "); i += tipc_media_addr_printf(buf + i, sz - i, &n->links[0].maddr); i += scnprintf(buf + i, sz - i, "\n"); i += tipc_link_dump(n->links[0].link, TIPC_DUMP_NONE, buf + i); i += scnprintf(buf + i, sz - i, " inputq: "); i += tipc_list_dump(&n->links[0].inputq, false, buf + i); i += scnprintf(buf + i, sz - i, "link_entry[1]:\n"); i += scnprintf(buf + i, sz - i, " mtu: %u\n", n->links[1].mtu); i += scnprintf(buf + i, sz - i, " media: "); i += tipc_media_addr_printf(buf + i, sz - i, &n->links[1].maddr); i += scnprintf(buf + i, sz - i, "\n"); i += tipc_link_dump(n->links[1].link, TIPC_DUMP_NONE, buf + i); i += scnprintf(buf + i, sz - i, " inputq: "); i += tipc_list_dump(&n->links[1].inputq, false, buf + i); i += scnprintf(buf + i, sz - i, "bclink:\n "); i += tipc_link_dump(n->bc_entry.link, TIPC_DUMP_NONE, buf + i); return i; } void tipc_node_pre_cleanup_net(struct net *exit_net) { struct tipc_node *n; struct tipc_net *tn; struct net *tmp; rcu_read_lock(); for_each_net_rcu(tmp) { if (tmp == exit_net) continue; tn = tipc_net(tmp); if (!tn) continue; spin_lock_bh(&tn->node_list_lock); 
list_for_each_entry_rcu(n, &tn->node_list, list) { if (!n->peer_net) continue; if (n->peer_net != exit_net) continue; tipc_node_write_lock(n); n->peer_net = NULL; n->peer_hash_mix = 0; tipc_node_write_unlock_fast(n); break; } spin_unlock_bh(&tn->node_list_lock); } rcu_read_unlock(); }
3 3 18138 18541 18175 18142 452 459 455 190 271 15 16 257 255 190 190 190 611 8 605 598 604 3161 3175 911 908 3474 3622 12 3291 37 179 214 212 3 3 946 969 1372 1368 1376 995 993 1000 31 4 26 30 31 31 712 718 718 568 243 335 2 565 569 564 914 924 918 926 923 767 763 762 760 769 133 134 138 132 153 153 151 153 152 17073 17057 17090 17027 17092 15643 1796 274 272 274 271 71 57 7146 7171 7171 7190 2597 4652 150 10906 10910 1 1 12045 59 11 50 89 89 1416 1414 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 
413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 
913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 
1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 
1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 
2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 // SPDX-License-Identifier: GPL-2.0-only /* * AppArmor security module * * This file contains AppArmor LSM hooks. * * Copyright (C) 1998-2008 Novell/SUSE * Copyright 2009-2010 Canonical Ltd. */ #include <linux/lsm_hooks.h> #include <linux/moduleparam.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/ptrace.h> #include <linux/ctype.h> #include <linux/sysctl.h> #include <linux/audit.h> #include <linux/user_namespace.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <linux/zstd.h> #include <net/sock.h> #include <uapi/linux/mount.h> #include <uapi/linux/lsm.h> #include "include/apparmor.h" #include "include/apparmorfs.h" #include "include/audit.h" #include "include/capability.h" #include "include/cred.h" #include "include/file.h" #include "include/ipc.h" #include "include/net.h" #include "include/path.h" #include "include/label.h" #include "include/policy.h" #include "include/policy_ns.h" #include "include/procattr.h" #include "include/mount.h" #include "include/secid.h" /* Flag indicating whether initialization completed */ int apparmor_initialized; union aa_buffer { struct list_head list; DECLARE_FLEX_ARRAY(char, 
buffer); }; struct aa_local_cache { unsigned int hold; unsigned int count; struct list_head head; }; #define RESERVE_COUNT 2 static int reserve_count = RESERVE_COUNT; static int buffer_count; static LIST_HEAD(aa_global_buffers); static DEFINE_SPINLOCK(aa_buffers_lock); static DEFINE_PER_CPU(struct aa_local_cache, aa_local_buffers); /* * LSM hook functions */ /* * put the associated labels */ static void apparmor_cred_free(struct cred *cred) { aa_put_label(cred_label(cred)); set_cred_label(cred, NULL); } /* * allocate the apparmor part of blank credentials */ static int apparmor_cred_alloc_blank(struct cred *cred, gfp_t gfp) { set_cred_label(cred, NULL); return 0; } /* * prepare new cred label for modification by prepare_cred block */ static int apparmor_cred_prepare(struct cred *new, const struct cred *old, gfp_t gfp) { set_cred_label(new, aa_get_newest_label(cred_label(old))); return 0; } /* * transfer the apparmor data to a blank set of creds */ static void apparmor_cred_transfer(struct cred *new, const struct cred *old) { set_cred_label(new, aa_get_newest_label(cred_label(old))); } static void apparmor_task_free(struct task_struct *task) { aa_free_task_ctx(task_ctx(task)); } static int apparmor_task_alloc(struct task_struct *task, unsigned long clone_flags) { struct aa_task_ctx *new = task_ctx(task); aa_dup_task_ctx(new, task_ctx(current)); return 0; } static int apparmor_ptrace_access_check(struct task_struct *child, unsigned int mode) { struct aa_label *tracer, *tracee; const struct cred *cred; int error; cred = get_task_cred(child); tracee = cred_label(cred); /* ref count on cred */ tracer = __begin_current_label_crit_section(); error = aa_may_ptrace(current_cred(), tracer, cred, tracee, (mode & PTRACE_MODE_READ) ? 
AA_PTRACE_READ : AA_PTRACE_TRACE); __end_current_label_crit_section(tracer); put_cred(cred); return error; } static int apparmor_ptrace_traceme(struct task_struct *parent) { struct aa_label *tracer, *tracee; const struct cred *cred; int error; tracee = __begin_current_label_crit_section(); cred = get_task_cred(parent); tracer = cred_label(cred); /* ref count on cred */ error = aa_may_ptrace(cred, tracer, current_cred(), tracee, AA_PTRACE_TRACE); put_cred(cred); __end_current_label_crit_section(tracee); return error; } /* Derived from security/commoncap.c:cap_capget */ static int apparmor_capget(const struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) { struct aa_label *label; const struct cred *cred; rcu_read_lock(); cred = __task_cred(target); label = aa_get_newest_cred_label(cred); /* * cap_capget is stacked ahead of this and will * initialize effective and permitted. */ if (!unconfined(label)) { struct aa_profile *profile; struct label_it i; label_for_each_confined(i, label, profile) { struct aa_ruleset *rules; if (COMPLAIN_MODE(profile)) continue; rules = list_first_entry(&profile->rules, typeof(*rules), list); *effective = cap_intersect(*effective, rules->caps.allow); *permitted = cap_intersect(*permitted, rules->caps.allow); } } rcu_read_unlock(); aa_put_label(label); return 0; } static int apparmor_capable(const struct cred *cred, struct user_namespace *ns, int cap, unsigned int opts) { struct aa_label *label; int error = 0; label = aa_get_newest_cred_label(cred); if (!unconfined(label)) error = aa_capable(cred, label, cap, opts); aa_put_label(label); return error; } /** * common_perm - basic common permission check wrapper fn for paths * @op: operation being checked * @path: path to check permission of (NOT NULL) * @mask: requested permissions mask * @cond: conditional info for the permission request (NOT NULL) * * Returns: %0 else error code if error or permission denied */ static int common_perm(const 
char *op, const struct path *path, u32 mask, struct path_cond *cond) { struct aa_label *label; int error = 0; label = __begin_current_label_crit_section(); if (!unconfined(label)) error = aa_path_perm(op, current_cred(), label, path, 0, mask, cond); __end_current_label_crit_section(label); return error; } /** * common_perm_cond - common permission wrapper around inode cond * @op: operation being checked * @path: location to check (NOT NULL) * @mask: requested permissions mask * * Returns: %0 else error code if error or permission denied */ static int common_perm_cond(const char *op, const struct path *path, u32 mask) { vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_idmap(path->mnt), d_backing_inode(path->dentry)); struct path_cond cond = { vfsuid_into_kuid(vfsuid), d_backing_inode(path->dentry)->i_mode }; if (!path_mediated_fs(path->dentry)) return 0; return common_perm(op, path, mask, &cond); } /** * common_perm_dir_dentry - common permission wrapper when path is dir, dentry * @op: operation being checked * @dir: directory of the dentry (NOT NULL) * @dentry: dentry to check (NOT NULL) * @mask: requested permissions mask * @cond: conditional info for the permission request (NOT NULL) * * Returns: %0 else error code if error or permission denied */ static int common_perm_dir_dentry(const char *op, const struct path *dir, struct dentry *dentry, u32 mask, struct path_cond *cond) { struct path path = { .mnt = dir->mnt, .dentry = dentry }; return common_perm(op, &path, mask, cond); } /** * common_perm_rm - common permission wrapper for operations doing rm * @op: operation being checked * @dir: directory that the dentry is in (NOT NULL) * @dentry: dentry being rm'd (NOT NULL) * @mask: requested permission mask * * Returns: %0 else error code if error or permission denied */ static int common_perm_rm(const char *op, const struct path *dir, struct dentry *dentry, u32 mask) { struct inode *inode = d_backing_inode(dentry); struct path_cond cond = { }; vfsuid_t vfsuid; if (!inode || 
!path_mediated_fs(dentry)) return 0; vfsuid = i_uid_into_vfsuid(mnt_idmap(dir->mnt), inode); cond.uid = vfsuid_into_kuid(vfsuid); cond.mode = inode->i_mode; return common_perm_dir_dentry(op, dir, dentry, mask, &cond); } /** * common_perm_create - common permission wrapper for operations doing create * @op: operation being checked * @dir: directory that dentry will be created in (NOT NULL) * @dentry: dentry to create (NOT NULL) * @mask: request permission mask * @mode: created file mode * * Returns: %0 else error code if error or permission denied */ static int common_perm_create(const char *op, const struct path *dir, struct dentry *dentry, u32 mask, umode_t mode) { struct path_cond cond = { current_fsuid(), mode }; if (!path_mediated_fs(dir->dentry)) return 0; return common_perm_dir_dentry(op, dir, dentry, mask, &cond); } static int apparmor_path_unlink(const struct path *dir, struct dentry *dentry) { return common_perm_rm(OP_UNLINK, dir, dentry, AA_MAY_DELETE); } static int apparmor_path_mkdir(const struct path *dir, struct dentry *dentry, umode_t mode) { return common_perm_create(OP_MKDIR, dir, dentry, AA_MAY_CREATE, S_IFDIR); } static int apparmor_path_rmdir(const struct path *dir, struct dentry *dentry) { return common_perm_rm(OP_RMDIR, dir, dentry, AA_MAY_DELETE); } static int apparmor_path_mknod(const struct path *dir, struct dentry *dentry, umode_t mode, unsigned int dev) { return common_perm_create(OP_MKNOD, dir, dentry, AA_MAY_CREATE, mode); } static int apparmor_path_truncate(const struct path *path) { return common_perm_cond(OP_TRUNC, path, MAY_WRITE | AA_MAY_SETATTR); } static int apparmor_file_truncate(struct file *file) { return apparmor_path_truncate(&file->f_path); } static int apparmor_path_symlink(const struct path *dir, struct dentry *dentry, const char *old_name) { return common_perm_create(OP_SYMLINK, dir, dentry, AA_MAY_CREATE, S_IFLNK); } static int apparmor_path_link(struct dentry *old_dentry, const struct path *new_dir, struct dentry 
*new_dentry) { struct aa_label *label; int error = 0; if (!path_mediated_fs(old_dentry)) return 0; label = begin_current_label_crit_section(); if (!unconfined(label)) error = aa_path_link(current_cred(), label, old_dentry, new_dir, new_dentry); end_current_label_crit_section(label); return error; } static int apparmor_path_rename(const struct path *old_dir, struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry, const unsigned int flags) { struct aa_label *label; int error = 0; if (!path_mediated_fs(old_dentry)) return 0; if ((flags & RENAME_EXCHANGE) && !path_mediated_fs(new_dentry)) return 0; label = begin_current_label_crit_section(); if (!unconfined(label)) { struct mnt_idmap *idmap = mnt_idmap(old_dir->mnt); vfsuid_t vfsuid; struct path old_path = { .mnt = old_dir->mnt, .dentry = old_dentry }; struct path new_path = { .mnt = new_dir->mnt, .dentry = new_dentry }; struct path_cond cond = { .mode = d_backing_inode(old_dentry)->i_mode }; vfsuid = i_uid_into_vfsuid(idmap, d_backing_inode(old_dentry)); cond.uid = vfsuid_into_kuid(vfsuid); if (flags & RENAME_EXCHANGE) { struct path_cond cond_exchange = { .mode = d_backing_inode(new_dentry)->i_mode, }; vfsuid = i_uid_into_vfsuid(idmap, d_backing_inode(old_dentry)); cond_exchange.uid = vfsuid_into_kuid(vfsuid); error = aa_path_perm(OP_RENAME_SRC, current_cred(), label, &new_path, 0, MAY_READ | AA_MAY_GETATTR | MAY_WRITE | AA_MAY_SETATTR | AA_MAY_DELETE, &cond_exchange); if (!error) error = aa_path_perm(OP_RENAME_DEST, current_cred(), label, &old_path, 0, MAY_WRITE | AA_MAY_SETATTR | AA_MAY_CREATE, &cond_exchange); } if (!error) error = aa_path_perm(OP_RENAME_SRC, current_cred(), label, &old_path, 0, MAY_READ | AA_MAY_GETATTR | MAY_WRITE | AA_MAY_SETATTR | AA_MAY_DELETE, &cond); if (!error) error = aa_path_perm(OP_RENAME_DEST, current_cred(), label, &new_path, 0, MAY_WRITE | AA_MAY_SETATTR | AA_MAY_CREATE, &cond); } end_current_label_crit_section(label); return error; } static int 
apparmor_path_chmod(const struct path *path, umode_t mode) { return common_perm_cond(OP_CHMOD, path, AA_MAY_CHMOD); } static int apparmor_path_chown(const struct path *path, kuid_t uid, kgid_t gid) { return common_perm_cond(OP_CHOWN, path, AA_MAY_CHOWN); } static int apparmor_inode_getattr(const struct path *path) { return common_perm_cond(OP_GETATTR, path, AA_MAY_GETATTR); } static int apparmor_file_open(struct file *file) { struct aa_file_ctx *fctx = file_ctx(file); struct aa_label *label; int error = 0; if (!path_mediated_fs(file->f_path.dentry)) return 0; /* If in exec, permission is handled by bprm hooks. * Cache permissions granted by the previous exec check, with * implicit read and executable mmap which are required to * actually execute the image. * * Illogically, FMODE_EXEC is in f_flags, not f_mode. */ if (file->f_flags & __FMODE_EXEC) { fctx->allow = MAY_EXEC | MAY_READ | AA_EXEC_MMAP; return 0; } label = aa_get_newest_cred_label(file->f_cred); if (!unconfined(label)) { struct mnt_idmap *idmap = file_mnt_idmap(file); struct inode *inode = file_inode(file); vfsuid_t vfsuid; struct path_cond cond = { .mode = inode->i_mode, }; vfsuid = i_uid_into_vfsuid(idmap, inode); cond.uid = vfsuid_into_kuid(vfsuid); error = aa_path_perm(OP_OPEN, file->f_cred, label, &file->f_path, 0, aa_map_file_to_perms(file), &cond); /* todo cache full allowed permissions set and state */ fctx->allow = aa_map_file_to_perms(file); } aa_put_label(label); return error; } static int apparmor_file_alloc_security(struct file *file) { struct aa_file_ctx *ctx = file_ctx(file); struct aa_label *label = begin_current_label_crit_section(); spin_lock_init(&ctx->lock); rcu_assign_pointer(ctx->label, aa_get_label(label)); end_current_label_crit_section(label); return 0; } static void apparmor_file_free_security(struct file *file) { struct aa_file_ctx *ctx = file_ctx(file); if (ctx) aa_put_label(rcu_access_pointer(ctx->label)); } static int common_file_perm(const char *op, struct file *file, u32 
mask, bool in_atomic) { struct aa_label *label; int error = 0; /* don't reaudit files closed during inheritance */ if (file->f_path.dentry == aa_null.dentry) return -EACCES; label = __begin_current_label_crit_section(); error = aa_file_perm(op, current_cred(), label, file, mask, in_atomic); __end_current_label_crit_section(label); return error; } static int apparmor_file_receive(struct file *file) { return common_file_perm(OP_FRECEIVE, file, aa_map_file_to_perms(file), false); } static int apparmor_file_permission(struct file *file, int mask) { return common_file_perm(OP_FPERM, file, mask, false); } static int apparmor_file_lock(struct file *file, unsigned int cmd) { u32 mask = AA_MAY_LOCK; if (cmd == F_WRLCK) mask |= MAY_WRITE; return common_file_perm(OP_FLOCK, file, mask, false); } static int common_mmap(const char *op, struct file *file, unsigned long prot, unsigned long flags, bool in_atomic) { int mask = 0; if (!file || !file_ctx(file)) return 0; if (prot & PROT_READ) mask |= MAY_READ; /* * Private mappings don't require write perms since they don't * write back to the files */ if ((prot & PROT_WRITE) && !(flags & MAP_PRIVATE)) mask |= MAY_WRITE; if (prot & PROT_EXEC) mask |= AA_EXEC_MMAP; return common_file_perm(op, file, mask, in_atomic); } static int apparmor_mmap_file(struct file *file, unsigned long reqprot, unsigned long prot, unsigned long flags) { return common_mmap(OP_FMMAP, file, prot, flags, GFP_ATOMIC); } static int apparmor_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, unsigned long prot) { return common_mmap(OP_FMPROT, vma->vm_file, prot, !(vma->vm_flags & VM_SHARED) ? 
MAP_PRIVATE : 0, false);
}

#ifdef CONFIG_IO_URING
/*
 * Name the io_uring permission carried in @mask for audit output.
 * AA_MAY_CREATE_SQPOLL is tested before AA_MAY_OVERRIDE_CRED; an empty
 * string is returned when neither bit is set.
 */
static const char *audit_uring_mask(u32 mask)
{
	if (mask & AA_MAY_CREATE_SQPOLL)
		return "sqpoll";
	if (mask & AA_MAY_OVERRIDE_CRED)
		return "override_creds";
	return "";
}

/*
 * Audit callback for io_uring mediation.  Emits the requested permission
 * (and the denied one, if any) when they fall inside AA_URING_PERM_MASK,
 * followed by the target label when ad->uring.target is set.
 */
static void audit_uring_cb(struct audit_buffer *ab, void *va)
{
	struct apparmor_audit_data *ad = aad_of_va(va);

	if (ad->request & AA_URING_PERM_MASK) {
		audit_log_format(ab, " requested=\"%s\"",
				 audit_uring_mask(ad->request));
		if (ad->denied & AA_URING_PERM_MASK) {
			audit_log_format(ab, " denied=\"%s\"",
					 audit_uring_mask(ad->denied));
		}
	}
	if (ad->uring.target) {
		audit_log_format(ab, " tcontext=");
		aa_label_xaudit(ab, labels_ns(ad->subj_label),
				ad->uring.target,
				FLAGS_NONE, GFP_ATOMIC);
	}
}

/*
 * profile_uring - check an io_uring @request against a single @profile
 * @profile: profile to check (must not be NULL)
 * @request: permission being requested
 * @new: target label to match against, or NULL to use the permissions
 *       looked up directly from the mediation state
 * @cap: not referenced in this function
 * @ad: audit data handed to aa_check_perms()
 *
 * Only the first ruleset on profile->rules is consulted.
 *
 * Returns: 0 when RULE_MEDIATES() yields no state for AA_CLASS_IO_URING,
 *          else the result of aa_check_perms()
 */
static int profile_uring(struct aa_profile *profile, u32 request,
			 struct aa_label *new, int cap,
			 struct apparmor_audit_data *ad)
{
	unsigned int state;
	struct aa_ruleset *rules;
	int error = 0;

	AA_BUG(!profile);

	rules = list_first_entry(&profile->rules, typeof(*rules), list);
	state = RULE_MEDIATES(rules, AA_CLASS_IO_URING);
	if (state) {
		struct aa_perms perms = { };

		if (new) {
			aa_label_match(profile, rules, new, state,
				       false, request, &perms);
		} else {
			perms = *aa_lookup_perms(rules->policy, state);
		}
		aa_apply_modes_to_perms(profile, &perms);
		error = aa_check_perms(profile, &perms, request, ad,
				       audit_uring_cb);
	}

	return error;
}

/**
 * apparmor_uring_override_creds - check the requested cred override
 * @new: the target creds
 *
 * Check to see if the current task is allowed to override its credentials
 * to service an io_uring operation.
*/
static int apparmor_uring_override_creds(const struct cred *new)
{
	struct aa_profile *profile;
	struct aa_label *label;
	int error;
	DEFINE_AUDIT_DATA(ad, LSM_AUDIT_DATA_NONE, AA_CLASS_IO_URING,
			  OP_URING_OVERRIDE);

	/* the label of @new is both the audit target and the match target */
	ad.uring.target = cred_label(new);
	label = __begin_current_label_crit_section();
	error = fn_for_each(label, profile,
			profile_uring(profile, AA_MAY_OVERRIDE_CRED,
				      cred_label(new), CAP_SYS_ADMIN, &ad));
	__end_current_label_crit_section(label);

	return error;
}

/**
 * apparmor_uring_sqpoll - check if a io_uring polling thread can be created
 *
 * Check to see if the current task is allowed to create a new io_uring
 * kernel polling thread.
 */
static int apparmor_uring_sqpoll(void)
{
	struct aa_profile *profile;
	struct aa_label *label;
	int error;
	DEFINE_AUDIT_DATA(ad, LSM_AUDIT_DATA_NONE, AA_CLASS_IO_URING,
			  OP_URING_SQPOLL);

	label = __begin_current_label_crit_section();
	/* no target label: profile_uring() is called with new == NULL */
	error = fn_for_each(label, profile,
			profile_uring(profile, AA_MAY_CREATE_SQPOLL,
				      NULL, CAP_SYS_ADMIN, &ad));
	__end_current_label_crit_section(label);

	return error;
}
#endif /* CONFIG_IO_URING */

/*
 * sb_mount hook.  After masking out the mount magic and the bits in
 * AA_MS_IGNORE_MASK, a confined task's request is routed by flag, tested
 * in this order: MS_REMOUNT, MS_BIND, a propagation type change
 * (MS_SHARED/MS_PRIVATE/MS_SLAVE/MS_UNBINDABLE), MS_MOVE, and otherwise a
 * new mount.  Unconfined tasks get 0 without a check.
 */
static int apparmor_sb_mount(const char *dev_name, const struct path *path,
			     const char *type, unsigned long flags, void *data)
{
	struct aa_label *label;
	int error = 0;

	/* Discard magic */
	if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
		flags &= ~MS_MGC_MSK;

	flags &= ~AA_MS_IGNORE_MASK;

	label = __begin_current_label_crit_section();
	if (!unconfined(label)) {
		if (flags & MS_REMOUNT)
			error = aa_remount(current_cred(), label, path, flags,
					   data);
		else if (flags & MS_BIND)
			error = aa_bind_mount(current_cred(), label, path,
					      dev_name, flags);
		else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE |
				  MS_UNBINDABLE))
			error = aa_mount_change_type(current_cred(), label,
						     path, flags);
		else if (flags & MS_MOVE)
			error = aa_move_mount_old(current_cred(), label, path,
						  dev_name);
		else
			error = aa_new_mount(current_cred(), label, dev_name,
					     path, type, flags, data);
	}
	__end_current_label_crit_section(label);

	return error;
}

static int
apparmor_move_mount(const struct path *from_path,
		    const struct path *to_path)
{
	struct aa_label *label;
	int error = 0;

	label = __begin_current_label_crit_section();
	/* unconfined tasks are not checked */
	if (!unconfined(label))
		error = aa_move_mount(current_cred(), label, from_path,
				      to_path);
	__end_current_label_crit_section(label);

	return error;
}

/* sb_umount hook: confined tasks are checked with aa_umount(). */
static int apparmor_sb_umount(struct vfsmount *mnt, int flags)
{
	struct aa_label *label;
	int error = 0;

	label = __begin_current_label_crit_section();
	if (!unconfined(label))
		error = aa_umount(current_cred(), label, mnt, flags);
	__end_current_label_crit_section(label);

	return error;
}

/*
 * sb_pivotroot hook: confined tasks are checked with aa_pivotroot().
 * Unlike the mount hooks above, this takes the label with
 * aa_get_current_label() and releases it with aa_put_label().
 */
static int apparmor_sb_pivotroot(const struct path *old_path,
				 const struct path *new_path)
{
	struct aa_label *label;
	int error = 0;

	label = aa_get_current_label();
	if (!unconfined(label))
		error = aa_pivotroot(current_cred(), label, old_path, new_path);
	aa_put_label(label);

	return error;
}

/*
 * apparmor_getselfattr - report one of the current task's label attributes
 * @attr: LSM_ATTR_CURRENT, LSM_ATTR_PREV or LSM_ATTR_EXEC
 * @lx: user buffer handed to lsm_fill_user_ctx()
 * @size: size argument handed to lsm_fill_user_ctx()
 * @flags: not referenced in this function
 *
 * Returns: 1 when no error occurred, -EOPNOTSUPP for any other @attr,
 *          -ENOENT when the requested label is not set, or the negative
 *          value returned by aa_getprocattr() / lsm_fill_user_ctx()
 */
static int apparmor_getselfattr(unsigned int attr, struct lsm_ctx __user *lx,
				u32 *size, u32 flags)
{
	int error = -ENOENT;
	struct aa_task_ctx *ctx = task_ctx(current);
	struct aa_label *label = NULL;
	char *value = NULL;

	switch (attr) {
	case LSM_ATTR_CURRENT:
		label = aa_get_newest_label(cred_label(current_cred()));
		break;
	case LSM_ATTR_PREV:
		if (ctx->previous)
			label = aa_get_newest_label(ctx->previous);
		break;
	case LSM_ATTR_EXEC:
		if (ctx->onexec)
			label = aa_get_newest_label(ctx->onexec);
		break;
	default:
		error = -EOPNOTSUPP;
		break;
	}

	if (label) {
		error = aa_getprocattr(label, &value, false);
		/* a positive return is passed on as the value length */
		if (error > 0)
			error = lsm_fill_user_ctx(lx, size, value, error,
						  LSM_ID_APPARMOR, 0);
		kfree(value);
	}

	aa_put_label(label);

	if (error < 0)
		return error;
	return 1;
}

/*
 * apparmor_getprocattr - report a label attribute by @name
 * @task: task whose credentials supply the "current" label
 * @name: "current", "prev" or "exec"
 * @value: output pointer handed to aa_getprocattr()
 *
 * NOTE(review): "prev" and "exec" are read from task_ctx(current), not
 * from @task - confirm this is intended.
 */
static int apparmor_getprocattr(struct task_struct *task, const char *name,
				char **value)
{
	int error = -ENOENT;
	/* released below */
	const struct cred *cred = get_task_cred(task);
	struct aa_task_ctx *ctx = task_ctx(current);
	struct aa_label *label = NULL;

	if (strcmp(name, "current") == 0)
		label = aa_get_newest_label(cred_label(cred));
	else if
(strcmp(name, "prev") == 0 && ctx->previous)
		label = aa_get_newest_label(ctx->previous);
	else if (strcmp(name, "exec") == 0 && ctx->onexec)
		label = aa_get_newest_label(ctx->onexec);
	else
		/* unknown name, or "prev"/"exec" with no label set */
		error = -EINVAL;

	if (label)
		error = aa_getprocattr(label, value, true);

	aa_put_label(label);
	put_cred(cred);

	return error;
}

/*
 * do_setattr - parse and execute a "<command> <args>" attribute write
 * @attr: LSM_ATTR_CURRENT or LSM_ATTR_EXEC; anything else is audited and
 *        rejected
 * @value: buffer holding the command string
 * @size: number of bytes in @value
 *
 * If @value is not NUL terminated it is copied into a temporary buffer
 * that is.  The first space-separated word is the command; the remainder,
 * with leading spaces skipped, is its argument.  A missing or empty
 * argument yields -EINVAL without an audit message.  An unknown command
 * or unsupported @attr is audited as AUDIT_APPARMOR_DENIED.
 *
 * Returns: @size when the handler returns 0, otherwise the handler's
 *          return value, -EINVAL or -ENOMEM
 */
static int do_setattr(u64 attr, void *value, size_t size)
{
	char *command, *largs = NULL, *args = value;
	size_t arg_size;
	int error;
	DEFINE_AUDIT_DATA(ad, LSM_AUDIT_DATA_NONE, AA_CLASS_NONE,
			  OP_SETPROCATTR);

	if (size == 0)
		return -EINVAL;

	/* AppArmor requires that the buffer must be null terminated atm */
	if (args[size - 1] != '\0') {
		/* null terminate */
		largs = args = kmalloc(size + 1, GFP_KERNEL);
		if (!args)
			return -ENOMEM;
		memcpy(args, value, size);
		args[size] = '\0';
	}

	error = -EINVAL;
	args = strim(args);
	command = strsep(&args, " ");
	if (!args)
		goto out;
	args = skip_spaces(args);
	if (!*args)
		goto out;

	/* bytes from the start of the argument to the end of the input */
	arg_size = size - (args - (largs ? largs : (char *) value));
	if (attr == LSM_ATTR_CURRENT) {
		if (strcmp(command, "changehat") == 0) {
			error = aa_setprocattr_changehat(args, arg_size,
							 AA_CHANGE_NOFLAGS);
		} else if (strcmp(command, "permhat") == 0) {
			error = aa_setprocattr_changehat(args, arg_size,
							 AA_CHANGE_TEST);
		} else if (strcmp(command, "changeprofile") == 0) {
			error = aa_change_profile(args, AA_CHANGE_NOFLAGS);
		} else if (strcmp(command, "permprofile") == 0) {
			error = aa_change_profile(args, AA_CHANGE_TEST);
		} else if (strcmp(command, "stack") == 0) {
			error = aa_change_profile(args, AA_CHANGE_STACK);
		} else
			goto fail;
	} else if (attr == LSM_ATTR_EXEC) {
		if (strcmp(command, "exec") == 0)
			error = aa_change_profile(args, AA_CHANGE_ONEXEC);
		else if (strcmp(command, "stack") == 0)
			error = aa_change_profile(args, (AA_CHANGE_ONEXEC |
							 AA_CHANGE_STACK));
		else
			goto fail;
	} else
		/* only support the "current" and "exec" process attributes */
		goto fail;

	if (!error)
		error = size;
out:
	/* largs is NULL unless a terminated copy was made above */
	kfree(largs);
	return error;

fail:
	ad.subj_label =
begin_current_label_crit_section();
	if (attr == LSM_ATTR_CURRENT)
		ad.info = "current";
	else if (attr == LSM_ATTR_EXEC)
		ad.info = "exec";
	else
		ad.info = "invalid";
	ad.error = error = -EINVAL;
	aa_audit_msg(AUDIT_APPARMOR_DENIED, &ad, NULL);
	end_current_label_crit_section(ad.subj_label);
	goto out;
}

/*
 * setselfattr hook: only LSM_ATTR_CURRENT and LSM_ATTR_EXEC are supported.
 * The command is taken from ctx->ctx / ctx->ctx_len; @size and @flags are
 * not referenced.  A positive result from do_setattr() is reported as 0.
 */
static int apparmor_setselfattr(unsigned int attr, struct lsm_ctx *ctx,
				u32 size, u32 flags)
{
	int rc;

	if (attr != LSM_ATTR_CURRENT && attr != LSM_ATTR_EXEC)
		return -EOPNOTSUPP;

	rc = do_setattr(attr, ctx->ctx, ctx->ctx_len);
	if (rc > 0)
		return 0;
	return rc;
}

/*
 * setprocattr hook: map @name to an attribute id with lsm_name_to_attr()
 * and hand off to do_setattr().  Returns -EINVAL when the mapping is 0.
 */
static int apparmor_setprocattr(const char *name, void *value,
				size_t size)
{
	int attr = lsm_name_to_attr(name);

	if (attr)
		return do_setattr(attr, value, size);
	return -EINVAL;
}

/**
 * apparmor_bprm_committing_creds - do task cleanup on committing new creds
 * @bprm: binprm for the exec  (NOT NULL)
 */
static void apparmor_bprm_committing_creds(const struct linux_binprm *bprm)
{
	struct aa_label *label = aa_current_raw_label();
	struct aa_label *new_label = cred_label(bprm->cred);

	/* bail out if unconfined or not changing profile */
	if ((new_label->proxy == label->proxy) ||
	    (unconfined(new_label)))
		return;

	aa_inherit_files(bprm->cred, current->files);

	current->pdeath_signal = 0;

	/* reset soft limits and set hard limits for the new label */
	__aa_transition_rlimits(label, new_label);
}

/**
 * apparmor_bprm_committed_creds() - do cleanup after new creds committed
 * @bprm: binprm for the exec  (NOT NULL)
 */
static void apparmor_bprm_committed_creds(const struct linux_binprm *bprm)
{
	/* clear out temporary/transitional state from the context */
	aa_clear_task_ctx_trans(task_ctx(current));

	return;
}

/* current_getsecid_subj hook: store the current label's secid in @secid. */
static void apparmor_current_getsecid_subj(u32 *secid)
{
	struct aa_label *label = __begin_current_label_crit_section();
	*secid = label->secid;
	__end_current_label_crit_section(label);
}

/* task_getsecid_obj hook: store the secid of @p's label in @secid. */
static void apparmor_task_getsecid_obj(struct task_struct *p, u32 *secid)
{
	struct aa_label *label = aa_get_task_label(p);
	*secid = label->secid;
aa_put_label(label);
}

/* task_setrlimit hook: confined tasks are checked with aa_task_setrlimit(). */
static int apparmor_task_setrlimit(struct task_struct *task,
		unsigned int resource, struct rlimit *new_rlim)
{
	struct aa_label *label = __begin_current_label_crit_section();
	int error = 0;

	if (!unconfined(label))
		error = aa_task_setrlimit(current_cred(), label, task,
					  resource, new_rlim);
	__end_current_label_crit_section(label);

	return error;
}

/*
 * task_kill hook: check whether @sig may be sent to @target.
 * When @cred is supplied the sender's label is taken from it; otherwise
 * the current task's credentials and label are used.  @info is not
 * referenced.  The target's cred and label references are dropped before
 * returning the result of aa_may_signal().
 */
static int apparmor_task_kill(struct task_struct *target, struct kernel_siginfo *info,
			      int sig, const struct cred *cred)
{
	const struct cred *tc;
	struct aa_label *cl, *tl;
	int error;

	tc = get_task_cred(target);
	tl = aa_get_newest_cred_label(tc);
	if (cred) {
		/*
		 * Dealing with USB IO specific behavior
		 */
		cl = aa_get_newest_cred_label(cred);
		error = aa_may_signal(cred, cl, tc, tl, sig);
		aa_put_label(cl);
	} else {
		cl = __begin_current_label_crit_section();
		error = aa_may_signal(current_cred(), cl, tc, tl, sig);
		__end_current_label_crit_section(cl);
	}
	aa_put_label(tl);
	put_cred(tc);

	return error;
}

/*
 * userns_create hook: for a confined task, run aa_profile_ns_perm() with
 * AA_USERNS_CREATE over each profile in the current label.  @cred is not
 * referenced; the audit subject cred is current_cred().
 */
static int apparmor_userns_create(const struct cred *cred)
{
	struct aa_label *label;
	struct aa_profile *profile;
	int error = 0;
	DEFINE_AUDIT_DATA(ad, LSM_AUDIT_DATA_TASK, AA_CLASS_NS,
			  OP_USERNS_CREATE);

	ad.subj_cred = current_cred();

	label = begin_current_label_crit_section();
	if (!unconfined(label)) {
		error = fn_for_each(label, profile,
				    aa_profile_ns_perm(profile, &ad,
						       AA_USERNS_CREATE));
	}
	end_current_label_crit_section(label);

	return error;
}

/*
 * sk_alloc_security hook: allocate a zeroed aa_sk_ctx with @flags and
 * attach it to @sk.  @family is not referenced.
 * Returns 0, or -ENOMEM if the allocation fails.
 */
static int apparmor_sk_alloc_security(struct sock *sk, int family, gfp_t flags)
{
	struct aa_sk_ctx *ctx;

	ctx = kzalloc(sizeof(*ctx), flags);
	if (!ctx)
		return -ENOMEM;

	sk->sk_security = ctx;

	return 0;
}

/*
 * sk_free_security hook: detach the context from @sk, put its label and
 * peer label, then free it.
 */
static void apparmor_sk_free_security(struct sock *sk)
{
	struct aa_sk_ctx *ctx = aa_sock(sk);

	sk->sk_security = NULL;
	aa_put_label(ctx->label);
	aa_put_label(ctx->peer);
	kfree(ctx);
}

/**
 * apparmor_sk_clone_security - clone the sk_security field
 * @sk: sock to have security cloned
 * @newsk: sock getting clone
 */
static void apparmor_sk_clone_security(const struct sock *sk,
struct sock *newsk)
{
	struct aa_sk_ctx *ctx = aa_sock(sk);
	struct aa_sk_ctx *new = aa_sock(newsk);

	/* drop whatever the new sock already holds before taking new refs */
	if (new->label)
		aa_put_label(new->label);
	new->label = aa_get_label(ctx->label);

	if (new->peer)
		aa_put_label(new->peer);
	new->peer = aa_get_label(ctx->peer);
}

/*
 * socket_create hook: kernel sockets (@kern) and unconfined tasks are not
 * checked.  Otherwise the request is passed to af_select() with
 * create_perm() and aa_af_perm() as its two candidate checks.
 */
static int apparmor_socket_create(int family, int type, int protocol, int kern)
{
	struct aa_label *label;
	int error = 0;

	AA_BUG(in_interrupt());

	label = begin_current_label_crit_section();
	if (!(kern || unconfined(label)))
		error = af_select(family,
				  create_perm(label, family, type, protocol),
				  aa_af_perm(current_cred(), label,
					     OP_CREATE, AA_MAY_CREATE,
					     family, type, protocol));
	end_current_label_crit_section(label);

	return error;
}

/**
 * apparmor_socket_post_create - setup the per-socket security struct
 * @sock: socket that is being setup
 * @family: family of socket being created
 * @type: type of the socket
 * @protocol: protocol of the socket
 * @kern: socket is a special kernel socket
 *
 * Note:
 * -   kernel sockets labeled kernel_t used to use unconfined
 * -   socket may not have sk here if created with sock_create_lite or
 *     sock_alloc. These should be accept cases which will be handled in
 *     sock_graft.
*/
static int apparmor_socket_post_create(struct socket *sock, int family,
				       int type, int protocol, int kern)
{
	struct aa_label *label;

	if (kern) {
		label = aa_get_label(kernel_t);
	} else
		label = aa_get_current_label();

	if (sock->sk) {
		struct aa_sk_ctx *ctx = aa_sock(sock->sk);

		/* replace any label already stored on the sock */
		aa_put_label(ctx->label);
		ctx->label = aa_get_label(label);
	}
	/* drop the reference taken above; the sock holds its own */
	aa_put_label(label);

	return 0;
}

/*
 * socket_bind hook: @sock, sock->sk and @address must be non-NULL and the
 * caller must not be in interrupt context (all enforced with AA_BUG()).
 * The request is passed to af_select() with bind_perm() and aa_sk_perm()
 * as its two candidate checks.
 */
static int apparmor_socket_bind(struct socket *sock,
				struct sockaddr *address, int addrlen)
{
	AA_BUG(!sock);
	AA_BUG(!sock->sk);
	AA_BUG(!address);
	AA_BUG(in_interrupt());

	return af_select(sock->sk->sk_family,
			 bind_perm(sock, address, addrlen),
			 aa_sk_perm(OP_BIND, AA_MAY_BIND, sock->sk));
}

/* socket_connect hook: same shape as bind, requesting AA_MAY_CONNECT. */
static int apparmor_socket_connect(struct socket *sock,
				   struct sockaddr *address, int addrlen)
{
	AA_BUG(!sock);
	AA_BUG(!sock->sk);
	AA_BUG(!address);
	AA_BUG(in_interrupt());

	return af_select(sock->sk->sk_family,
			 connect_perm(sock, address, addrlen),
			 aa_sk_perm(OP_CONNECT, AA_MAY_CONNECT, sock->sk));
}

/* socket_listen hook: requests AA_MAY_LISTEN on the socket. */
static int apparmor_socket_listen(struct socket *sock, int backlog)
{
	AA_BUG(!sock);
	AA_BUG(!sock->sk);
	AA_BUG(in_interrupt());

	return af_select(sock->sk->sk_family,
			 listen_perm(sock, backlog),
			 aa_sk_perm(OP_LISTEN, AA_MAY_LISTEN, sock->sk));
}

/*
 * Note: while @newsock is created and has some information, the accept
 *       has not been done.
*/
static int apparmor_socket_accept(struct socket *sock, struct socket *newsock)
{
	AA_BUG(!sock);
	AA_BUG(!sock->sk);
	AA_BUG(!newsock);
	AA_BUG(in_interrupt());

	/* the permission is requested on the listening @sock, not @newsock */
	return af_select(sock->sk->sk_family,
			 accept_perm(sock, newsock),
			 aa_sk_perm(OP_ACCEPT, AA_MAY_ACCEPT, sock->sk));
}

/*
 * Shared helper for sendmsg/recvmsg: @sock, sock->sk and @msg must be
 * non-NULL.  Passes @op and @request to af_select() with msg_perm() and
 * aa_sk_perm() as its two candidate checks.
 */
static int aa_sock_msg_perm(const char *op, u32 request, struct socket *sock,
			    struct msghdr *msg, int size)
{
	AA_BUG(!sock);
	AA_BUG(!sock->sk);
	AA_BUG(!msg);
	AA_BUG(in_interrupt());

	return af_select(sock->sk->sk_family,
			 msg_perm(op, request, sock, msg, size),
			 aa_sk_perm(op, request, sock->sk));
}

/* socket_sendmsg hook: requests AA_MAY_SEND. */
static int apparmor_socket_sendmsg(struct socket *sock,
				   struct msghdr *msg, int size)
{
	return aa_sock_msg_perm(OP_SENDMSG, AA_MAY_SEND, sock, msg, size);
}

/* socket_recvmsg hook: requests AA_MAY_RECEIVE; @flags is not referenced. */
static int apparmor_socket_recvmsg(struct socket *sock,
				   struct msghdr *msg, int size, int flags)
{
	return aa_sock_msg_perm(OP_RECVMSG, AA_MAY_RECEIVE, sock, msg, size);
}

/* revalidation, get/set attr, shutdown */
static int aa_sock_perm(const char *op, u32 request, struct socket *sock)
{
	AA_BUG(!sock);
	AA_BUG(!sock->sk);
	AA_BUG(in_interrupt());

	return af_select(sock->sk->sk_family,
			 sock_perm(op, request, sock),
			 aa_sk_perm(op, request, sock->sk));
}

/* socket_getsockname hook: requests AA_MAY_GETATTR. */
static int apparmor_socket_getsockname(struct socket *sock)
{
	return aa_sock_perm(OP_GETSOCKNAME, AA_MAY_GETATTR, sock);
}

/* socket_getpeername hook: requests AA_MAY_GETATTR. */
static int apparmor_socket_getpeername(struct socket *sock)
{
	return aa_sock_perm(OP_GETPEERNAME, AA_MAY_GETATTR, sock);
}

/* revalidation, get/set attr, opt */
static int aa_sock_opt_perm(const char *op, u32 request, struct socket *sock,
			    int level, int optname)
{
	AA_BUG(!sock);
	AA_BUG(!sock->sk);
	AA_BUG(in_interrupt());

	return af_select(sock->sk->sk_family,
			 opt_perm(op, request, sock, level, optname),
			 aa_sk_perm(op, request, sock->sk));
}

/* socket_getsockopt hook: requests AA_MAY_GETOPT. */
static int apparmor_socket_getsockopt(struct socket *sock, int level,
				      int optname)
{
	return aa_sock_opt_perm(OP_GETSOCKOPT, AA_MAY_GETOPT, sock,
				level, optname);
}

/* socket_setsockopt hook: requests AA_MAY_SETOPT. */
static int apparmor_socket_setsockopt(struct socket *sock, int level,
				      int optname)
{
return aa_sock_opt_perm(OP_SETSOCKOPT, AA_MAY_SETOPT, sock,
				level, optname);
}

/* socket_shutdown hook: requests AA_MAY_SHUTDOWN; @how is not referenced. */
static int apparmor_socket_shutdown(struct socket *sock, int how)
{
	return aa_sock_perm(OP_SHUTDOWN, AA_MAY_SHUTDOWN, sock);
}

#ifdef CONFIG_NETWORK_SECMARK
/**
 * apparmor_socket_sock_rcv_skb - check perms before associating skb to sk
 * @sk: sk to associate @skb with
 * @skb: skb to check for perms
 *
 * Note: can not sleep may be called with locks held
 *
 * dont want protocol specific in __skb_recv_datagram()
 * to deny an incoming connection  socket_sock_rcv_skb()
 */
static int apparmor_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	struct aa_sk_ctx *ctx = aa_sock(sk);

	/* packets without a secmark are not checked */
	if (!skb->secmark)
		return 0;

	return apparmor_secmark_check(ctx->label, OP_RECVMSG, AA_MAY_RECEIVE,
				      skb->secmark, sk);
}
#endif


/*
 * Return the peer label stored on @sk, or ERR_PTR(-ENOPROTOOPT) when none
 * is set.  No reference is taken on the returned label here.
 */
static struct aa_label *sk_peer_label(struct sock *sk)
{
	struct aa_sk_ctx *ctx = aa_sock(sk);

	if (ctx->peer)
		return ctx->peer;

	return ERR_PTR(-ENOPROTOOPT);
}

/**
 * apparmor_socket_getpeersec_stream - get security context of peer
 * @sock: socket that we are trying to get the peer context of
 * @optval: output - buffer to copy peer name to
 * @optlen: output - size of copied name in @optval
 * @len: size of @optval buffer
 * Returns: 0 on success, -errno of failure
 *
 * Note: for tcp only valid if using ipsec or cipso on lan
 */
static int apparmor_socket_getpeersec_stream(struct socket *sock,
					     sockptr_t optval, sockptr_t optlen,
					     unsigned int len)
{
	char *name = NULL;
	int slen, error = 0;
	struct aa_label *label;
	struct aa_label *peer;

	label = begin_current_label_crit_section();
	peer = sk_peer_label(sock->sk);
	if (IS_ERR(peer)) {
		error = PTR_ERR(peer);
		goto done;
	}
	slen = aa_label_asxprint(&name, labels_ns(label), peer,
				 FLAG_SHOW_MODE | FLAG_VIEW_SUBNS |
				 FLAG_HIDDEN_UNCONFINED, GFP_KERNEL);
	/* don't include terminating \0 in slen, it breaks some apps */
	if (slen < 0) {
		error = -ENOMEM;
		goto done;
	}
	/* buffer too small: skip the copy but still report the needed length */
	if (slen > len) {
		error = -ERANGE;
		goto done_len;
	}

	if (copy_to_sockptr(optval, name, slen))
		error =
-EFAULT;
done_len:
	/* reached on success and on -ERANGE; writes slen back through optlen */
	if (copy_to_sockptr(optlen, &slen, sizeof(slen)))
		error = -EFAULT;
done:
	end_current_label_crit_section(label);
	/* name is still NULL on the paths that jump here before printing */
	kfree(name);
	return error;
}

/**
 * apparmor_socket_getpeersec_dgram - get security label of packet
 * @sock: the peer socket
 * @skb: packet data
 * @secid: pointer to where to put the secid of the packet
 *
 * Sets the netlabel socket state on sk from parent
 *
 * NOTE(review): the body currently only returns -ENOPROTOOPT and touches
 * none of its arguments.
 */
static int apparmor_socket_getpeersec_dgram(struct socket *sock,
					    struct sk_buff *skb, u32 *secid)

{
	/* TODO: requires secid support */
	return -ENOPROTOOPT;
}

/**
 * apparmor_sock_graft - Initialize newly created socket
 * @sk: child sock
 * @parent: parent socket
 *
 * Note: could set off of SOCK_CTX(parent) but need to track inode and we can
 *       just set sk security information off of current creating process label
 *       Labeling of sk for accept case - probably should be sock based
 *       instead of task, because of the case where an implicitly labeled
 *       socket is shared by different tasks.
 */
static void apparmor_sock_graft(struct sock *sk, struct socket *parent)
{
	struct aa_sk_ctx *ctx = aa_sock(sk);

	/* only label socks that do not have a label yet */
	if (!ctx->label)
		ctx->label = aa_get_current_label();
}

#ifdef CONFIG_NETWORK_SECMARK
/*
 * inet_conn_request hook: when @skb carries a secmark, check
 * AA_MAY_CONNECT for the label stored on @sk.  @req is not referenced.
 */
static int apparmor_inet_conn_request(const struct sock *sk, struct sk_buff *skb,
				      struct request_sock *req)
{
	struct aa_sk_ctx *ctx = aa_sock(sk);

	if (!skb->secmark)
		return 0;

	return apparmor_secmark_check(ctx->label, OP_CONNECT, AA_MAY_CONNECT,
				      skb->secmark, sk);
}
#endif

/*
 * The cred blob is a pointer to, not an instance of, an aa_label.
*/
struct lsm_blob_sizes apparmor_blob_sizes __ro_after_init = {
	.lbs_cred = sizeof(struct aa_label *),
	.lbs_file = sizeof(struct aa_file_ctx),
	.lbs_task = sizeof(struct aa_task_ctx),
};

/* Name and id under which AppArmor identifies itself as an LSM. */
static const struct lsm_id apparmor_lsmid = {
	.name = "apparmor",
	.id = LSM_ID_APPARMOR,
};

/* Table binding each LSM hook AppArmor implements to its handler. */
static struct security_hook_list apparmor_hooks[] __ro_after_init = {
	/* ptrace and capabilities */
	LSM_HOOK_INIT(ptrace_access_check, apparmor_ptrace_access_check),
	LSM_HOOK_INIT(ptrace_traceme, apparmor_ptrace_traceme),
	LSM_HOOK_INIT(capget, apparmor_capget),
	LSM_HOOK_INIT(capable, apparmor_capable),

	/* mount */
	LSM_HOOK_INIT(move_mount, apparmor_move_mount),
	LSM_HOOK_INIT(sb_mount, apparmor_sb_mount),
	LSM_HOOK_INIT(sb_umount, apparmor_sb_umount),
	LSM_HOOK_INIT(sb_pivotroot, apparmor_sb_pivotroot),

	/* path based */
	LSM_HOOK_INIT(path_link, apparmor_path_link),
	LSM_HOOK_INIT(path_unlink, apparmor_path_unlink),
	LSM_HOOK_INIT(path_symlink, apparmor_path_symlink),
	LSM_HOOK_INIT(path_mkdir, apparmor_path_mkdir),
	LSM_HOOK_INIT(path_rmdir, apparmor_path_rmdir),
	LSM_HOOK_INIT(path_mknod, apparmor_path_mknod),
	LSM_HOOK_INIT(path_rename, apparmor_path_rename),
	LSM_HOOK_INIT(path_chmod, apparmor_path_chmod),
	LSM_HOOK_INIT(path_chown, apparmor_path_chown),
	LSM_HOOK_INIT(path_truncate, apparmor_path_truncate),
	LSM_HOOK_INIT(inode_getattr, apparmor_inode_getattr),

	/* open files */
	LSM_HOOK_INIT(file_open, apparmor_file_open),
	LSM_HOOK_INIT(file_receive, apparmor_file_receive),
	LSM_HOOK_INIT(file_permission, apparmor_file_permission),
	LSM_HOOK_INIT(file_alloc_security, apparmor_file_alloc_security),
	LSM_HOOK_INIT(file_free_security, apparmor_file_free_security),
	LSM_HOOK_INIT(mmap_file, apparmor_mmap_file),
	LSM_HOOK_INIT(file_mprotect, apparmor_file_mprotect),
	LSM_HOOK_INIT(file_lock, apparmor_file_lock),
	LSM_HOOK_INIT(file_truncate, apparmor_file_truncate),

	/* process attributes */
	LSM_HOOK_INIT(getselfattr, apparmor_getselfattr),
	LSM_HOOK_INIT(setselfattr, apparmor_setselfattr),
	LSM_HOOK_INIT(getprocattr, apparmor_getprocattr),
	LSM_HOOK_INIT(setprocattr, apparmor_setprocattr),
LSM_HOOK_INIT(sk_alloc_security, apparmor_sk_alloc_security),
	LSM_HOOK_INIT(sk_free_security, apparmor_sk_free_security),
	LSM_HOOK_INIT(sk_clone_security, apparmor_sk_clone_security),

	/* socket operations */
	LSM_HOOK_INIT(socket_create, apparmor_socket_create),
	LSM_HOOK_INIT(socket_post_create, apparmor_socket_post_create),
	LSM_HOOK_INIT(socket_bind, apparmor_socket_bind),
	LSM_HOOK_INIT(socket_connect, apparmor_socket_connect),
	LSM_HOOK_INIT(socket_listen, apparmor_socket_listen),
	LSM_HOOK_INIT(socket_accept, apparmor_socket_accept),
	LSM_HOOK_INIT(socket_sendmsg, apparmor_socket_sendmsg),
	LSM_HOOK_INIT(socket_recvmsg, apparmor_socket_recvmsg),
	LSM_HOOK_INIT(socket_getsockname, apparmor_socket_getsockname),
	LSM_HOOK_INIT(socket_getpeername, apparmor_socket_getpeername),
	LSM_HOOK_INIT(socket_getsockopt, apparmor_socket_getsockopt),
	LSM_HOOK_INIT(socket_setsockopt, apparmor_socket_setsockopt),
	LSM_HOOK_INIT(socket_shutdown, apparmor_socket_shutdown),
	/* handler is only built with CONFIG_NETWORK_SECMARK */
#ifdef CONFIG_NETWORK_SECMARK
	LSM_HOOK_INIT(socket_sock_rcv_skb, apparmor_socket_sock_rcv_skb),
#endif
	LSM_HOOK_INIT(socket_getpeersec_stream,
		      apparmor_socket_getpeersec_stream),
	LSM_HOOK_INIT(socket_getpeersec_dgram,
		      apparmor_socket_getpeersec_dgram),
	LSM_HOOK_INIT(sock_graft, apparmor_sock_graft),
#ifdef CONFIG_NETWORK_SECMARK
	LSM_HOOK_INIT(inet_conn_request, apparmor_inet_conn_request),
#endif

	/* credentials */
	LSM_HOOK_INIT(cred_alloc_blank, apparmor_cred_alloc_blank),
	LSM_HOOK_INIT(cred_free, apparmor_cred_free),
	LSM_HOOK_INIT(cred_prepare, apparmor_cred_prepare),
	LSM_HOOK_INIT(cred_transfer, apparmor_cred_transfer),

	/* exec */
	LSM_HOOK_INIT(bprm_creds_for_exec, apparmor_bprm_creds_for_exec),
	LSM_HOOK_INIT(bprm_committing_creds, apparmor_bprm_committing_creds),
	LSM_HOOK_INIT(bprm_committed_creds, apparmor_bprm_committed_creds),

	/* tasks */
	LSM_HOOK_INIT(task_free, apparmor_task_free),
	LSM_HOOK_INIT(task_alloc, apparmor_task_alloc),
	LSM_HOOK_INIT(current_getsecid_subj,
		      apparmor_current_getsecid_subj),
	LSM_HOOK_INIT(task_getsecid_obj, apparmor_task_getsecid_obj),
LSM_HOOK_INIT(task_setrlimit, apparmor_task_setrlimit),
	LSM_HOOK_INIT(task_kill, apparmor_task_kill),
	LSM_HOOK_INIT(userns_create, apparmor_userns_create),

	/* audit rule hooks are registered only with CONFIG_AUDIT */
#ifdef CONFIG_AUDIT
	LSM_HOOK_INIT(audit_rule_init, aa_audit_rule_init),
	LSM_HOOK_INIT(audit_rule_known, aa_audit_rule_known),
	LSM_HOOK_INIT(audit_rule_match, aa_audit_rule_match),
	LSM_HOOK_INIT(audit_rule_free, aa_audit_rule_free),
#endif

	LSM_HOOK_INIT(secid_to_secctx, apparmor_secid_to_secctx),
	LSM_HOOK_INIT(secctx_to_secid, apparmor_secctx_to_secid),
	LSM_HOOK_INIT(release_secctx, apparmor_release_secctx),

	/* io_uring hooks are registered only with CONFIG_IO_URING */
#ifdef CONFIG_IO_URING
	LSM_HOOK_INIT(uring_override_creds, apparmor_uring_override_creds),
	LSM_HOOK_INIT(uring_sqpoll, apparmor_uring_sqpoll),
#endif
};

/*
 * AppArmor sysfs module parameters
 */

/*
 * Each custom parameter type below pairs forward-declared set/get handlers
 * with a param_check_<type> alias to a stock checker and a
 * kernel_param_ops table that module_param_named() refers to by type name.
 */

/* "aabool": bool parameter; the ops carry KERNEL_PARAM_OPS_FL_NOARG */
static int param_set_aabool(const char *val, const struct kernel_param *kp);
static int param_get_aabool(char *buffer, const struct kernel_param *kp);
#define param_check_aabool param_check_bool
static const struct kernel_param_ops param_ops_aabool = {
	.flags = KERNEL_PARAM_OPS_FL_NOARG,
	.set = param_set_aabool,
	.get = param_get_aabool
};

/* "aauint": unsigned int parameter */
static int param_set_aauint(const char *val, const struct kernel_param *kp);
static int param_get_aauint(char *buffer, const struct kernel_param *kp);
#define param_check_aauint param_check_uint
static const struct kernel_param_ops param_ops_aauint = {
	.set = param_set_aauint,
	.get = param_get_aauint
};

/* "aacompressionlevel": int parameter */
static int param_set_aacompressionlevel(const char *val,
					const struct kernel_param *kp);
static int param_get_aacompressionlevel(char *buffer,
					const struct kernel_param *kp);
#define param_check_aacompressionlevel param_check_int
static const struct kernel_param_ops param_ops_aacompressionlevel = {
	.set = param_set_aacompressionlevel,
	.get = param_get_aacompressionlevel
};

/* "aalockpolicy": bool parameter */
static int param_set_aalockpolicy(const char *val, const struct kernel_param *kp);
static int param_get_aalockpolicy(char *buffer, const struct kernel_param *kp);
#define param_check_aalockpolicy param_check_bool
/* Ops table for the "aalockpolicy" parameter type; set may be given no argument. */
static const struct kernel_param_ops param_ops_aalockpolicy = {
	.flags = KERNEL_PARAM_OPS_FL_NOARG,
	.set = param_set_aalockpolicy,
	.get = param_get_aalockpolicy
};

/* handlers for the "audit" and "mode" parameters, wired with module_param_call() */
static int param_set_audit(const char *val, const struct kernel_param *kp);
static int param_get_audit(char *buffer, const struct kernel_param *kp);

static int param_set_mode(const char *val, const struct kernel_param *kp);
static int param_get_mode(char *buffer, const struct kernel_param *kp);

/* Flag values, also controllable via /sys/module/apparmor/parameters
 * We define special types as we want to do additional mediation.
 */

/* AppArmor global enforcement switch - complain, enforce, kill */
enum profile_mode aa_g_profile_mode = APPARMOR_ENFORCE;
module_param_call(mode, param_set_mode, param_get_mode,
		  &aa_g_profile_mode, S_IRUSR | S_IWUSR);

/* whether policy verification hashing is enabled */
bool aa_g_hash_policy = IS_ENABLED(CONFIG_SECURITY_APPARMOR_HASH_DEFAULT);
/* the parameter is only exposed when hashing support is built in */
#ifdef CONFIG_SECURITY_APPARMOR_HASH
module_param_named(hash_policy, aa_g_hash_policy, aabool, S_IRUSR | S_IWUSR);
#endif

/* whether policy exactly as loaded is retained for debug and checkpointing */
bool aa_g_export_binary = IS_ENABLED(CONFIG_SECURITY_APPARMOR_EXPORT_BINARY);
#ifdef CONFIG_SECURITY_APPARMOR_EXPORT_BINARY
module_param_named(export_binary, aa_g_export_binary, aabool, 0600);
#endif

/* policy loaddata compression level */
int aa_g_rawdata_compression_level = AA_DEFAULT_CLEVEL;
module_param_named(rawdata_compression_level, aa_g_rawdata_compression_level,
		   aacompressionlevel, 0400);

/* Debug mode */
bool aa_g_debug = IS_ENABLED(CONFIG_SECURITY_APPARMOR_DEBUG_MESSAGES);
module_param_named(debug, aa_g_debug, aabool, S_IRUSR | S_IWUSR);

/* Audit mode */
enum audit_mode aa_g_audit;
module_param_call(audit, param_set_audit, param_get_audit,
		  &aa_g_audit, S_IRUSR | S_IWUSR);

/* Determines if audit header is included in audited messages.
This
 * provides more context if the audit daemon is not running
 */
bool aa_g_audit_header = true;
module_param_named(audit_header, aa_g_audit_header, aabool,
		   S_IRUSR | S_IWUSR);

/* lock out loading/removal of policy
 * TODO: add in at boot loading of policy, which is the only way to
 *       load policy, if lock_policy is set
 */
bool aa_g_lock_policy;
module_param_named(lock_policy, aa_g_lock_policy, aalockpolicy,
		   S_IRUSR | S_IWUSR);

/* Syscall logging mode */
bool aa_g_logsyscall;
module_param_named(logsyscall, aa_g_logsyscall, aabool, S_IRUSR | S_IWUSR);

/* Maximum pathname length before accesses will start getting rejected */
unsigned int aa_g_path_max = 2 * PATH_MAX;
module_param_named(path_max, aa_g_path_max, aauint, S_IRUSR);

/* Determines how paranoid loading of policy is and how much verification
 * on the loaded policy is done.
 * DEPRECATED: read only as strict checking of load is always done now
 * that non-root users (user namespaces) can load policy.
 */
bool aa_g_paranoid_load = IS_ENABLED(CONFIG_SECURITY_APPARMOR_PARANOID_LOAD);
module_param_named(paranoid_load, aa_g_paranoid_load, aabool, S_IRUGO);

/* "aaintbool": an int variable presented through the bool param helpers */
static int param_get_aaintbool(char *buffer, const struct kernel_param *kp);
static int param_set_aaintbool(const char *val, const struct kernel_param *kp);
#define param_check_aaintbool param_check_int
static const struct kernel_param_ops param_ops_aaintbool = {
	.set = param_set_aaintbool,
	.get = param_get_aaintbool
};
/* Boot time disable flag */
static int apparmor_enabled __ro_after_init = 1;
module_param_named(enabled, apparmor_enabled, aaintbool, 0444);

/*
 * Handler for the "apparmor=" boot option.  The value is parsed with
 * kstrtoul() (base auto-detected); any non-zero value enables AppArmor and
 * zero disables it.  An unparsable value leaves the setting untouched.
 * Always returns 1.
 */
static int __init apparmor_enabled_setup(char *str)
{
	unsigned long enabled;
	int error = kstrtoul(str, 0, &enabled);
	if (!error)
		apparmor_enabled = enabled ?
1 : 0; return 1; } __setup("apparmor=", apparmor_enabled_setup); /* set global flag turning off the ability to load policy */ static int param_set_aalockpolicy(const char *val, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_admin_capable(NULL)) return -EPERM; return param_set_bool(val, kp); } static int param_get_aalockpolicy(char *buffer, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_view_capable(NULL)) return -EPERM; return param_get_bool(buffer, kp); } static int param_set_aabool(const char *val, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_admin_capable(NULL)) return -EPERM; return param_set_bool(val, kp); } static int param_get_aabool(char *buffer, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_view_capable(NULL)) return -EPERM; return param_get_bool(buffer, kp); } static int param_set_aauint(const char *val, const struct kernel_param *kp) { int error; if (!apparmor_enabled) return -EINVAL; /* file is ro but enforce 2nd line check */ if (apparmor_initialized) return -EPERM; error = param_set_uint(val, kp); aa_g_path_max = max_t(uint32_t, aa_g_path_max, sizeof(union aa_buffer)); pr_info("AppArmor: buffer size set to %d bytes\n", aa_g_path_max); return error; } static int param_get_aauint(char *buffer, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_view_capable(NULL)) return -EPERM; return param_get_uint(buffer, kp); } /* Can only be set before AppArmor is initialized (i.e. on boot cmdline). 
*/
static int param_set_aaintbool(const char *val, const struct kernel_param *kp)
{
	struct kernel_param kp_local;
	bool value;
	int error;

	if (apparmor_initialized)
		return -EPERM;

	/* Create local copy, with arg pointing to bool type. */
	value = !!*((int *)kp->arg);
	memcpy(&kp_local, kp, sizeof(kp_local));
	kp_local.arg = &value;

	error = param_set_bool(val, &kp_local);
	/* write the parsed bool back into the int the parameter refers to */
	if (!error)
		*((int *)kp->arg) = *((bool *)kp_local.arg);
	return error;
}

/*
 * To avoid changing /sys/module/apparmor/parameters/enabled from Y/N to
 * 1/0, this converts the "int that is actually bool" back to bool for
 * display in the /sys filesystem, while keeping it "int" for the LSM
 * infrastructure.
 */
static int param_get_aaintbool(char *buffer, const struct kernel_param *kp)
{
	struct kernel_param kp_local;
	bool value;

	/* Create local copy, with arg pointing to bool type. */
	value = !!*((int *)kp->arg);
	memcpy(&kp_local, kp, sizeof(kp_local));
	kp_local.arg = &value;

	return param_get_bool(buffer, &kp_local);
}

/*
 * Set the rawdata compression level.  Refused with -EPERM once AppArmor is
 * initialized.  After parsing, the level is clamped to
 * [AA_MIN_CLEVEL, AA_MAX_CLEVEL] and logged; both happen even when parsing
 * returned an error.
 */
static int param_set_aacompressionlevel(const char *val,
					const struct kernel_param *kp)
{
	int error;

	if (!apparmor_enabled)
		return -EINVAL;
	if (apparmor_initialized)
		return -EPERM;

	error = param_set_int(val, kp);

	aa_g_rawdata_compression_level = clamp(aa_g_rawdata_compression_level,
					       AA_MIN_CLEVEL, AA_MAX_CLEVEL);
	pr_info("AppArmor: policy rawdata compression level set to %d\n",
		aa_g_rawdata_compression_level);

	return error;
}

/* Read the compression level; needs policy view capability once initialized. */
static int param_get_aacompressionlevel(char *buffer,
					const struct kernel_param *kp)
{
	if (!apparmor_enabled)
		return -EINVAL;
	if (apparmor_initialized && !aa_current_policy_view_capable(NULL))
		return -EPERM;
	return param_get_int(buffer, kp);
}

/* Write the name of the current audit mode into @buffer. */
static int param_get_audit(char *buffer, const struct kernel_param *kp)
{
	if (!apparmor_enabled)
		return -EINVAL;
	if (apparmor_initialized && !aa_current_policy_view_capable(NULL))
		return -EPERM;
	return sprintf(buffer, "%s", audit_mode_names[aa_g_audit]);
}

/*
 * Set the audit mode from its name.  @val must match an entry of
 * audit_mode_names; otherwise -EINVAL is returned.
 */
static int param_set_audit(const char *val, const struct kernel_param *kp)
{
int i; if (!apparmor_enabled) return -EINVAL; if (!val) return -EINVAL; if (apparmor_initialized && !aa_current_policy_admin_capable(NULL)) return -EPERM; i = match_string(audit_mode_names, AUDIT_MAX_INDEX, val); if (i < 0) return -EINVAL; aa_g_audit = i; return 0; } static int param_get_mode(char *buffer, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_view_capable(NULL)) return -EPERM; return sprintf(buffer, "%s", aa_profile_mode_names[aa_g_profile_mode]); } static int param_set_mode(const char *val, const struct kernel_param *kp) { int i; if (!apparmor_enabled) return -EINVAL; if (!val) return -EINVAL; if (apparmor_initialized && !aa_current_policy_admin_capable(NULL)) return -EPERM; i = match_string(aa_profile_mode_names, APPARMOR_MODE_NAMES_MAX_INDEX, val); if (i < 0) return -EINVAL; aa_g_profile_mode = i; return 0; } char *aa_get_buffer(bool in_atomic) { union aa_buffer *aa_buf; struct aa_local_cache *cache; bool try_again = true; gfp_t flags = (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN); /* use per cpu cached buffers first */ cache = get_cpu_ptr(&aa_local_buffers); if (!list_empty(&cache->head)) { aa_buf = list_first_entry(&cache->head, union aa_buffer, list); list_del(&aa_buf->list); cache->hold--; cache->count--; put_cpu_ptr(&aa_local_buffers); return &aa_buf->buffer[0]; } put_cpu_ptr(&aa_local_buffers); if (!spin_trylock(&aa_buffers_lock)) { cache = get_cpu_ptr(&aa_local_buffers); cache->hold += 1; put_cpu_ptr(&aa_local_buffers); spin_lock(&aa_buffers_lock); } else { cache = get_cpu_ptr(&aa_local_buffers); put_cpu_ptr(&aa_local_buffers); } retry: if (buffer_count > reserve_count || (in_atomic && !list_empty(&aa_global_buffers))) { aa_buf = list_first_entry(&aa_global_buffers, union aa_buffer, list); list_del(&aa_buf->list); buffer_count--; spin_unlock(&aa_buffers_lock); return aa_buf->buffer; } if (in_atomic) { /* * out of reserve buffers and in atomic context so increase * how 
many buffers to keep in reserve */ reserve_count++; flags = GFP_ATOMIC; } spin_unlock(&aa_buffers_lock); if (!in_atomic) might_sleep(); aa_buf = kmalloc(aa_g_path_max, flags); if (!aa_buf) { if (try_again) { try_again = false; spin_lock(&aa_buffers_lock); goto retry; } pr_warn_once("AppArmor: Failed to allocate a memory buffer.\n"); return NULL; } return aa_buf->buffer; } void aa_put_buffer(char *buf) { union aa_buffer *aa_buf; struct aa_local_cache *cache; if (!buf) return; aa_buf = container_of(buf, union aa_buffer, buffer[0]); cache = get_cpu_ptr(&aa_local_buffers); if (!cache->hold) { put_cpu_ptr(&aa_local_buffers); if (spin_trylock(&aa_buffers_lock)) { /* put back on global list */ list_add(&aa_buf->list, &aa_global_buffers); buffer_count++; spin_unlock(&aa_buffers_lock); cache = get_cpu_ptr(&aa_local_buffers); put_cpu_ptr(&aa_local_buffers); return; } /* contention on global list, fallback to percpu */ cache = get_cpu_ptr(&aa_local_buffers); cache->hold += 1; } /* cache in percpu list */ list_add(&aa_buf->list, &cache->head); cache->count++; put_cpu_ptr(&aa_local_buffers); } /* * AppArmor init functions */ /** * set_init_ctx - set a task context and profile on the first task. 
* * TODO: allow setting an alternate profile than unconfined */ static int __init set_init_ctx(void) { struct cred *cred = (__force struct cred *)current->real_cred; set_cred_label(cred, aa_get_label(ns_unconfined(root_ns))); return 0; } static void destroy_buffers(void) { union aa_buffer *aa_buf; spin_lock(&aa_buffers_lock); while (!list_empty(&aa_global_buffers)) { aa_buf = list_first_entry(&aa_global_buffers, union aa_buffer, list); list_del(&aa_buf->list); spin_unlock(&aa_buffers_lock); kfree(aa_buf); spin_lock(&aa_buffers_lock); } spin_unlock(&aa_buffers_lock); } static int __init alloc_buffers(void) { union aa_buffer *aa_buf; int i, num; /* * per cpu set of cached allocated buffers used to help reduce * lock contention */ for_each_possible_cpu(i) { per_cpu(aa_local_buffers, i).hold = 0; per_cpu(aa_local_buffers, i).count = 0; INIT_LIST_HEAD(&per_cpu(aa_local_buffers, i).head); } /* * A function may require two buffers at once. Usually the buffers are * used for a short period of time and are shared. On UP kernel buffers * two should be enough, with more CPUs it is possible that more * buffers will be used simultaneously. The preallocated pool may grow. * This preallocation has also the side-effect that AppArmor will be * disabled early at boot if aa_g_path_max is extremly high. 
*/ if (num_online_cpus() > 1) num = 4 + RESERVE_COUNT; else num = 2 + RESERVE_COUNT; for (i = 0; i < num; i++) { aa_buf = kmalloc(aa_g_path_max, GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN); if (!aa_buf) { destroy_buffers(); return -ENOMEM; } aa_put_buffer(aa_buf->buffer); } return 0; } #ifdef CONFIG_SYSCTL static int apparmor_dointvec(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { if (!aa_current_policy_admin_capable(NULL)) return -EPERM; if (!apparmor_enabled) return -EINVAL; return proc_dointvec(table, write, buffer, lenp, ppos); } static struct ctl_table apparmor_sysctl_table[] = { #ifdef CONFIG_USER_NS { .procname = "unprivileged_userns_apparmor_policy", .data = &unprivileged_userns_apparmor_policy, .maxlen = sizeof(int), .mode = 0600, .proc_handler = apparmor_dointvec, }, #endif /* CONFIG_USER_NS */ { .procname = "apparmor_display_secid_mode", .data = &apparmor_display_secid_mode, .maxlen = sizeof(int), .mode = 0600, .proc_handler = apparmor_dointvec, }, { .procname = "apparmor_restrict_unprivileged_unconfined", .data = &aa_unprivileged_unconfined_restricted, .maxlen = sizeof(int), .mode = 0600, .proc_handler = apparmor_dointvec, }, { } }; static int __init apparmor_init_sysctl(void) { return register_sysctl("kernel", apparmor_sysctl_table) ? 
0 : -ENOMEM; } #else static inline int apparmor_init_sysctl(void) { return 0; } #endif /* CONFIG_SYSCTL */ #if defined(CONFIG_NETFILTER) && defined(CONFIG_NETWORK_SECMARK) static unsigned int apparmor_ip_postroute(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct aa_sk_ctx *ctx; struct sock *sk; if (!skb->secmark) return NF_ACCEPT; sk = skb_to_full_sk(skb); if (sk == NULL) return NF_ACCEPT; ctx = aa_sock(sk); if (!apparmor_secmark_check(ctx->label, OP_SENDMSG, AA_MAY_SEND, skb->secmark, sk)) return NF_ACCEPT; return NF_DROP_ERR(-ECONNREFUSED); } static const struct nf_hook_ops apparmor_nf_ops[] = { { .hook = apparmor_ip_postroute, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_SELINUX_FIRST, }, #if IS_ENABLED(CONFIG_IPV6) { .hook = apparmor_ip_postroute, .pf = NFPROTO_IPV6, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP6_PRI_SELINUX_FIRST, }, #endif }; static int __net_init apparmor_nf_register(struct net *net) { return nf_register_net_hooks(net, apparmor_nf_ops, ARRAY_SIZE(apparmor_nf_ops)); } static void __net_exit apparmor_nf_unregister(struct net *net) { nf_unregister_net_hooks(net, apparmor_nf_ops, ARRAY_SIZE(apparmor_nf_ops)); } static struct pernet_operations apparmor_net_ops = { .init = apparmor_nf_register, .exit = apparmor_nf_unregister, }; static int __init apparmor_nf_ip_init(void) { int err; if (!apparmor_enabled) return 0; err = register_pernet_subsys(&apparmor_net_ops); if (err) panic("Apparmor: register_pernet_subsys: error %d\n", err); return 0; } __initcall(apparmor_nf_ip_init); #endif static char nulldfa_src[] = { #include "nulldfa.in" }; static struct aa_dfa *nulldfa; static char stacksplitdfa_src[] = { #include "stacksplitdfa.in" }; struct aa_dfa *stacksplitdfa; struct aa_policydb *nullpdb; static int __init aa_setup_dfa_engine(void) { int error = -ENOMEM; nullpdb = aa_alloc_pdb(GFP_KERNEL); if (!nullpdb) return -ENOMEM; nulldfa = aa_dfa_unpack(nulldfa_src, sizeof(nulldfa_src), 
TO_ACCEPT1_FLAG(YYTD_DATA32) | TO_ACCEPT2_FLAG(YYTD_DATA32)); if (IS_ERR(nulldfa)) { error = PTR_ERR(nulldfa); goto fail; } nullpdb->dfa = aa_get_dfa(nulldfa); nullpdb->perms = kcalloc(2, sizeof(struct aa_perms), GFP_KERNEL); if (!nullpdb->perms) goto fail; nullpdb->size = 2; stacksplitdfa = aa_dfa_unpack(stacksplitdfa_src, sizeof(stacksplitdfa_src), TO_ACCEPT1_FLAG(YYTD_DATA32) | TO_ACCEPT2_FLAG(YYTD_DATA32)); if (IS_ERR(stacksplitdfa)) { error = PTR_ERR(stacksplitdfa); goto fail; } return 0; fail: aa_put_pdb(nullpdb); aa_put_dfa(nulldfa); nullpdb = NULL; nulldfa = NULL; stacksplitdfa = NULL; return error; } static void __init aa_teardown_dfa_engine(void) { aa_put_dfa(stacksplitdfa); aa_put_dfa(nulldfa); aa_put_pdb(nullpdb); nullpdb = NULL; stacksplitdfa = NULL; nulldfa = NULL; } static int __init apparmor_init(void) { int error; error = aa_setup_dfa_engine(); if (error) { AA_ERROR("Unable to setup dfa engine\n"); goto alloc_out; } error = aa_alloc_root_ns(); if (error) { AA_ERROR("Unable to allocate default profile namespace\n"); goto alloc_out; } error = apparmor_init_sysctl(); if (error) { AA_ERROR("Unable to register sysctls\n"); goto alloc_out; } error = alloc_buffers(); if (error) { AA_ERROR("Unable to allocate work buffers\n"); goto alloc_out; } error = set_init_ctx(); if (error) { AA_ERROR("Failed to set context on init task\n"); aa_free_root_ns(); goto buffers_out; } security_add_hooks(apparmor_hooks, ARRAY_SIZE(apparmor_hooks), &apparmor_lsmid); /* Report that AppArmor successfully initialized */ apparmor_initialized = 1; if (aa_g_profile_mode == APPARMOR_COMPLAIN) aa_info_message("AppArmor initialized: complain mode enabled"); else if (aa_g_profile_mode == APPARMOR_KILL) aa_info_message("AppArmor initialized: kill mode enabled"); else aa_info_message("AppArmor initialized"); return error; buffers_out: destroy_buffers(); alloc_out: aa_destroy_aafs(); aa_teardown_dfa_engine(); apparmor_enabled = false; return error; } DEFINE_LSM(apparmor) = { .name = 
"apparmor", .flags = LSM_FLAG_LEGACY_MAJOR | LSM_FLAG_EXCLUSIVE, .enabled = &apparmor_enabled, .blobs = &apparmor_blob_sizes, .init = apparmor_init, };
5 2 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> * * Development of this code funded by Astaro AG (http://www.astaro.com/) */ #include <asm/unaligned.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> struct nft_byteorder { u8 sreg; u8 dreg; enum nft_byteorder_ops op:8; u8 len; u8 size; }; void nft_byteorder_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_byteorder *priv = nft_expr_priv(expr); u32 *src = &regs->data[priv->sreg]; u32 *dst = &regs->data[priv->dreg]; u16 *s16, *d16; unsigned int i; s16 = (void *)src; d16 = (void *)dst; switch (priv->size) { case 8: { u64 *dst64 = (void *)dst; u64 src64; switch (priv->op) { case NFT_BYTEORDER_NTOH: for (i = 0; i < priv->len / 8; i++) { src64 = nft_reg_load64(&src[i]); nft_reg_store64(&dst64[i], be64_to_cpu((__force __be64)src64)); } break; case NFT_BYTEORDER_HTON: for (i = 0; i < priv->len / 8; i++) { src64 = (__force __u64) cpu_to_be64(nft_reg_load64(&src[i])); nft_reg_store64(&dst64[i], src64); } break; } break; } case 4: 
switch (priv->op) { case NFT_BYTEORDER_NTOH: for (i = 0; i < priv->len / 4; i++) dst[i] = ntohl((__force __be32)src[i]); break; case NFT_BYTEORDER_HTON: for (i = 0; i < priv->len / 4; i++) dst[i] = (__force __u32)htonl(src[i]); break; } break; case 2: switch (priv->op) { case NFT_BYTEORDER_NTOH: for (i = 0; i < priv->len / 2; i++) d16[i] = ntohs((__force __be16)s16[i]); break; case NFT_BYTEORDER_HTON: for (i = 0; i < priv->len / 2; i++) d16[i] = (__force __u16)htons(s16[i]); break; } break; } } static const struct nla_policy nft_byteorder_policy[NFTA_BYTEORDER_MAX + 1] = { [NFTA_BYTEORDER_SREG] = { .type = NLA_U32 }, [NFTA_BYTEORDER_DREG] = { .type = NLA_U32 }, [NFTA_BYTEORDER_OP] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_BYTEORDER_LEN] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_BYTEORDER_SIZE] = NLA_POLICY_MAX(NLA_BE32, 255), }; static int nft_byteorder_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_byteorder *priv = nft_expr_priv(expr); u32 size, len; int err; if (tb[NFTA_BYTEORDER_SREG] == NULL || tb[NFTA_BYTEORDER_DREG] == NULL || tb[NFTA_BYTEORDER_LEN] == NULL || tb[NFTA_BYTEORDER_SIZE] == NULL || tb[NFTA_BYTEORDER_OP] == NULL) return -EINVAL; priv->op = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_OP])); switch (priv->op) { case NFT_BYTEORDER_NTOH: case NFT_BYTEORDER_HTON: break; default: return -EINVAL; } err = nft_parse_u32_check(tb[NFTA_BYTEORDER_SIZE], U8_MAX, &size); if (err < 0) return err; priv->size = size; switch (priv->size) { case 2: case 4: case 8: break; default: return -EINVAL; } err = nft_parse_u32_check(tb[NFTA_BYTEORDER_LEN], U8_MAX, &len); if (err < 0) return err; priv->len = len; err = nft_parse_register_load(tb[NFTA_BYTEORDER_SREG], &priv->sreg, priv->len); if (err < 0) return err; return nft_parse_register_store(ctx, tb[NFTA_BYTEORDER_DREG], &priv->dreg, NULL, NFT_DATA_VALUE, priv->len); } static int nft_byteorder_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const 
struct nft_byteorder *priv = nft_expr_priv(expr); if (nft_dump_register(skb, NFTA_BYTEORDER_SREG, priv->sreg)) goto nla_put_failure; if (nft_dump_register(skb, NFTA_BYTEORDER_DREG, priv->dreg)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_BYTEORDER_OP, htonl(priv->op))) goto nla_put_failure; if (nla_put_be32(skb, NFTA_BYTEORDER_LEN, htonl(priv->len))) goto nla_put_failure; if (nla_put_be32(skb, NFTA_BYTEORDER_SIZE, htonl(priv->size))) goto nla_put_failure; return 0; nla_put_failure: return -1; } static bool nft_byteorder_reduce(struct nft_regs_track *track, const struct nft_expr *expr) { struct nft_byteorder *priv = nft_expr_priv(expr); nft_reg_track_cancel(track, priv->dreg, priv->len); return false; } static const struct nft_expr_ops nft_byteorder_ops = { .type = &nft_byteorder_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_byteorder)), .eval = nft_byteorder_eval, .init = nft_byteorder_init, .dump = nft_byteorder_dump, .reduce = nft_byteorder_reduce, }; struct nft_expr_type nft_byteorder_type __read_mostly = { .name = "byteorder", .ops = &nft_byteorder_ops, .policy = nft_byteorder_policy, .maxattr = NFTA_BYTEORDER_MAX, .owner = THIS_MODULE, };
/*
 * NOTE(review): the numeric run on the next line is not C; it looks like
 * hit-count / line-number residue from text extraction. Confirm and drop.
 */
115 11939 589 523 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_BIT_SPINLOCK_H
#define __LINUX_BIT_SPINLOCK_H

#include <linux/kernel.h>
#include <linux/preempt.h>
#include <linux/atomic.h>
#include <linux/bug.h>

/*
 *  bit-based spin_lock()
 *
 * Don't use this unless you really need to: spin_lock() and spin_unlock()
 * are significantly faster.
 *
 * @bitnum: index of the bit used as the lock
 * @addr:   word (or bitmap) that holds the lock bit
 *
 * Returns with preemption disabled. When neither CONFIG_SMP nor
 * CONFIG_DEBUG_SPINLOCK is set, the bit itself is never touched and
 * disabling preemption is the only effect.
 */
static inline void bit_spin_lock(int bitnum, unsigned long *addr)
{
	/*
	 * Assuming the lock is uncontended, this never enters
	 * the body of the outer loop. If it is contended, then
	 * within the inner loop a non-atomic test is used to
	 * busywait with less bus contention for a good time to
	 * attempt to acquire the lock bit.
	 */
	preempt_disable();
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
	while (unlikely(test_and_set_bit_lock(bitnum, addr))) {
		/* Busy-wait with preemption enabled, re-disable before retrying. */
		preempt_enable();
		do {
			cpu_relax();
		} while (test_bit(bitnum, addr));
		preempt_disable();
	}
#endif
	__acquire(bitlock);
}

/*
 * Single attempt to take the bit lock, without spinning.
 *
 * Return true if it was acquired. On success preemption stays disabled;
 * on failure it is re-enabled before returning 0.
 */
static inline int bit_spin_trylock(int bitnum, unsigned long *addr)
{
	preempt_disable();
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
	if (unlikely(test_and_set_bit_lock(bitnum, addr))) {
		preempt_enable();
		return 0;
	}
#endif
	__acquire(bitlock);
	return 1;
}

/*
 *  bit-based spin_unlock()
 *
 * Clears the lock bit with clear_bit_unlock() (SMP or DEBUG_SPINLOCK
 * builds only) and re-enables preemption. With CONFIG_DEBUG_SPINLOCK,
 * unlocking a bit that is not set triggers BUG_ON().
 */
static inline void bit_spin_unlock(int bitnum, unsigned long *addr)
{
#ifdef CONFIG_DEBUG_SPINLOCK
	BUG_ON(!test_bit(bitnum, addr));
#endif
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
	clear_bit_unlock(bitnum, addr);
#endif
	preempt_enable();
	__release(bitlock);
}

/*
 *  bit-based spin_unlock()
 *  non-atomic version, which can be used eg. if the bit lock itself is
 *  protecting the rest of the flags in the word.
 *
 * Same sequence as bit_spin_unlock(), except that the bit is cleared
 * with __clear_bit_unlock().
 */
static inline void __bit_spin_unlock(int bitnum, unsigned long *addr)
{
#ifdef CONFIG_DEBUG_SPINLOCK
	BUG_ON(!test_bit(bitnum, addr));
#endif
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
	__clear_bit_unlock(bitnum, addr);
#endif
	preempt_enable();
	__release(bitlock);
}

/*
 * Return true if the lock is held.
 *
 * The answer depends on the build: SMP or DEBUG_SPINLOCK kernels test
 * the bit itself; otherwise, with CONFIG_PREEMPT_COUNT, the current
 * preempt_count() is returned (non-zero while preemption is disabled);
 * with neither, the function always returns 1.
 */
static inline int bit_spin_is_locked(int bitnum, unsigned long *addr)
{
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
	return test_bit(bitnum, addr);
#elif defined CONFIG_PREEMPT_COUNT
	return preempt_count();
#else
	return 1;
#endif
}

#endif /* __LINUX_BIT_SPINLOCK_H */
9 9 2 1 2 2 1 2 2 10 10 2 10 2 10 7 6 3 3 3 3 3 3 6 5 4 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 
513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 // SPDX-License-Identifier: GPL-2.0-or-later /* * ip_vs_est.c: simple rate estimator for IPVS * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com> * Network name space 
(netns) aware. * Global data moved to netns i.e struct netns_ipvs * Affected data: est_list and est_lock. * estimation_timer() runs with timer per netns. * get_stats()) do the per cpu summing. */ #define KMSG_COMPONENT "IPVS" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/types.h> #include <linux/interrupt.h> #include <linux/sysctl.h> #include <linux/list.h> #include <linux/rcupdate_wait.h> #include <net/ip_vs.h> /* This code is to estimate rate in a shorter interval (such as 8 seconds) for virtual services and real servers. For measure rate in a long interval, it is easy to implement a user level daemon which periodically reads those statistical counters and measure rate. We measure rate during the last 8 seconds every 2 seconds: avgrate = avgrate*(1-W) + rate*W where W = 2^(-2) NOTES. * Average bps is scaled by 2^5, while average pps and cps are scaled by 2^10. * Netlink users can see 64-bit values but sockopt users are restricted to 32-bit values for conns, packets, bps, cps and pps. 
* A lot of code is taken from net/core/gen_estimator.c KEY POINTS: - cpustats counters are updated per-cpu in SoftIRQ context with BH disabled - kthreads read the cpustats to update the estimators (svcs, dests, total) - the states of estimators can be read (get stats) or modified (zero stats) from processes KTHREADS: - estimators are added initially to est_temp_list and later kthread 0 distributes them to one or many kthreads for estimation - kthread contexts are created and attached to array - the kthread tasks are started when first service is added, before that the total stats are not estimated - when configuration (cpulist/nice) is changed, the tasks are restarted by work (est_reload_work) - kthread tasks are stopped while the cpulist is empty - the kthread context holds lists with estimators (chains) which are processed every 2 seconds - as estimators can be added dynamically and in bursts, we try to spread them to multiple chains which are estimated at different time - on start, kthread 0 enters calculation phase to determine the chain limits and the limit of estimators per kthread - est_add_ktid: ktid where to add new ests, can point to empty slot where we should add kt data */ static struct lock_class_key __ipvs_est_key; static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs); static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs); static void ip_vs_chain_estimation(struct hlist_head *chain) { struct ip_vs_estimator *e; struct ip_vs_cpu_stats *c; struct ip_vs_stats *s; u64 rate; hlist_for_each_entry_rcu(e, chain, list) { u64 conns, inpkts, outpkts, inbytes, outbytes; u64 kconns = 0, kinpkts = 0, koutpkts = 0; u64 kinbytes = 0, koutbytes = 0; unsigned int start; int i; if (kthread_should_stop()) break; s = container_of(e, struct ip_vs_stats, est); for_each_possible_cpu(i) { c = per_cpu_ptr(s->cpustats, i); do { start = u64_stats_fetch_begin(&c->syncp); conns = u64_stats_read(&c->cnt.conns); inpkts = u64_stats_read(&c->cnt.inpkts); outpkts = 
u64_stats_read(&c->cnt.outpkts); inbytes = u64_stats_read(&c->cnt.inbytes); outbytes = u64_stats_read(&c->cnt.outbytes); } while (u64_stats_fetch_retry(&c->syncp, start)); kconns += conns; kinpkts += inpkts; koutpkts += outpkts; kinbytes += inbytes; koutbytes += outbytes; } spin_lock(&s->lock); s->kstats.conns = kconns; s->kstats.inpkts = kinpkts; s->kstats.outpkts = koutpkts; s->kstats.inbytes = kinbytes; s->kstats.outbytes = koutbytes; /* scaled by 2^10, but divided 2 seconds */ rate = (s->kstats.conns - e->last_conns) << 9; e->last_conns = s->kstats.conns; e->cps += ((s64)rate - (s64)e->cps) >> 2; rate = (s->kstats.inpkts - e->last_inpkts) << 9; e->last_inpkts = s->kstats.inpkts; e->inpps += ((s64)rate - (s64)e->inpps) >> 2; rate = (s->kstats.outpkts - e->last_outpkts) << 9; e->last_outpkts = s->kstats.outpkts; e->outpps += ((s64)rate - (s64)e->outpps) >> 2; /* scaled by 2^5, but divided 2 seconds */ rate = (s->kstats.inbytes - e->last_inbytes) << 4; e->last_inbytes = s->kstats.inbytes; e->inbps += ((s64)rate - (s64)e->inbps) >> 2; rate = (s->kstats.outbytes - e->last_outbytes) << 4; e->last_outbytes = s->kstats.outbytes; e->outbps += ((s64)rate - (s64)e->outbps) >> 2; spin_unlock(&s->lock); } } static void ip_vs_tick_estimation(struct ip_vs_est_kt_data *kd, int row) { struct ip_vs_est_tick_data *td; int cid; rcu_read_lock(); td = rcu_dereference(kd->ticks[row]); if (!td) goto out; for_each_set_bit(cid, td->present, IPVS_EST_TICK_CHAINS) { if (kthread_should_stop()) break; ip_vs_chain_estimation(&td->chains[cid]); cond_resched_rcu(); td = rcu_dereference(kd->ticks[row]); if (!td) break; } out: rcu_read_unlock(); } static int ip_vs_estimation_kthread(void *data) { struct ip_vs_est_kt_data *kd = data; struct netns_ipvs *ipvs = kd->ipvs; int row = kd->est_row; unsigned long now; int id = kd->id; long gap; if (id > 0) { if (!ipvs->est_chain_max) return 0; } else { if (!ipvs->est_chain_max) { ipvs->est_calc_phase = 1; /* commit est_calc_phase before reading est_genid 
*/ smp_mb(); } /* kthread 0 will handle the calc phase */ if (ipvs->est_calc_phase) ip_vs_est_calc_phase(ipvs); } while (1) { if (!id && !hlist_empty(&ipvs->est_temp_list)) ip_vs_est_drain_temp_list(ipvs); set_current_state(TASK_IDLE); if (kthread_should_stop()) break; /* before estimation, check if we should sleep */ now = jiffies; gap = kd->est_timer - now; if (gap > 0) { if (gap > IPVS_EST_TICK) { kd->est_timer = now - IPVS_EST_TICK; gap = IPVS_EST_TICK; } schedule_timeout(gap); } else { __set_current_state(TASK_RUNNING); if (gap < -8 * IPVS_EST_TICK) kd->est_timer = now; } if (kd->tick_len[row]) ip_vs_tick_estimation(kd, row); row++; if (row >= IPVS_EST_NTICKS) row = 0; WRITE_ONCE(kd->est_row, row); kd->est_timer += IPVS_EST_TICK; } __set_current_state(TASK_RUNNING); return 0; } /* Schedule stop/start for kthread tasks */ void ip_vs_est_reload_start(struct netns_ipvs *ipvs) { /* Ignore reloads before first service is added */ if (!ipvs->enable) return; ip_vs_est_stopped_recalc(ipvs); /* Bump the kthread configuration genid */ atomic_inc(&ipvs->est_genid); queue_delayed_work(system_long_wq, &ipvs->est_reload_work, 0); } /* Start kthread task with current configuration */ int ip_vs_est_kthread_start(struct netns_ipvs *ipvs, struct ip_vs_est_kt_data *kd) { unsigned long now; int ret = 0; long gap; lockdep_assert_held(&ipvs->est_mutex); if (kd->task) goto out; now = jiffies; gap = kd->est_timer - now; /* Sync est_timer if task is starting later */ if (abs(gap) > 4 * IPVS_EST_TICK) kd->est_timer = now; kd->task = kthread_create(ip_vs_estimation_kthread, kd, "ipvs-e:%d:%d", ipvs->gen, kd->id); if (IS_ERR(kd->task)) { ret = PTR_ERR(kd->task); kd->task = NULL; goto out; } set_user_nice(kd->task, sysctl_est_nice(ipvs)); set_cpus_allowed_ptr(kd->task, sysctl_est_cpulist(ipvs)); pr_info("starting estimator thread %d...\n", kd->id); wake_up_process(kd->task); out: return ret; } void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd) { if (kd->task) { pr_info("stopping 
estimator thread %d...\n", kd->id); kthread_stop(kd->task); kd->task = NULL; } } /* Apply parameters to kthread */ static void ip_vs_est_set_params(struct netns_ipvs *ipvs, struct ip_vs_est_kt_data *kd) { kd->chain_max = ipvs->est_chain_max; /* We are using single chain on RCU preemption */ if (IPVS_EST_TICK_CHAINS == 1) kd->chain_max *= IPVS_EST_CHAIN_FACTOR; kd->tick_max = IPVS_EST_TICK_CHAINS * kd->chain_max; kd->est_max_count = IPVS_EST_NTICKS * kd->tick_max; } /* Create and start estimation kthread in a free or new array slot */ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs) { struct ip_vs_est_kt_data *kd = NULL; int id = ipvs->est_kt_count; int ret = -ENOMEM; void *arr = NULL; int i; if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads && ipvs->enable && ipvs->est_max_threads) return -EINVAL; mutex_lock(&ipvs->est_mutex); for (i = 0; i < id; i++) { if (!ipvs->est_kt_arr[i]) break; } if (i >= id) { arr = krealloc_array(ipvs->est_kt_arr, id + 1, sizeof(struct ip_vs_est_kt_data *), GFP_KERNEL); if (!arr) goto out; ipvs->est_kt_arr = arr; } else { id = i; } kd = kzalloc(sizeof(*kd), GFP_KERNEL); if (!kd) goto out; kd->ipvs = ipvs; bitmap_fill(kd->avail, IPVS_EST_NTICKS); kd->est_timer = jiffies; kd->id = id; ip_vs_est_set_params(ipvs, kd); /* Pre-allocate stats used in calc phase */ if (!id && !kd->calc_stats) { kd->calc_stats = ip_vs_stats_alloc(); if (!kd->calc_stats) goto out; } /* Start kthread tasks only when services are present */ if (ipvs->enable && !ip_vs_est_stopped(ipvs)) { ret = ip_vs_est_kthread_start(ipvs, kd); if (ret < 0) goto out; } if (arr) ipvs->est_kt_count++; ipvs->est_kt_arr[id] = kd; kd = NULL; /* Use most recent kthread for new ests */ ipvs->est_add_ktid = id; ret = 0; out: mutex_unlock(&ipvs->est_mutex); if (kd) { ip_vs_stats_free(kd->calc_stats); kfree(kd); } return ret; } /* Select ktid where to add new ests: available, unused or new slot */ static void ip_vs_est_update_ktid(struct netns_ipvs *ipvs) { int ktid, best 
= ipvs->est_kt_count; struct ip_vs_est_kt_data *kd; for (ktid = 0; ktid < ipvs->est_kt_count; ktid++) { kd = ipvs->est_kt_arr[ktid]; if (kd) { if (kd->est_count < kd->est_max_count) { best = ktid; break; } } else if (ktid < best) { best = ktid; } } ipvs->est_add_ktid = best; } /* Add estimator to current kthread (est_add_ktid) */ static int ip_vs_enqueue_estimator(struct netns_ipvs *ipvs, struct ip_vs_estimator *est) { struct ip_vs_est_kt_data *kd = NULL; struct ip_vs_est_tick_data *td; int ktid, row, crow, cid, ret; int delay = est->ktrow; BUILD_BUG_ON_MSG(IPVS_EST_TICK_CHAINS > 127, "Too many chains for ktcid"); if (ipvs->est_add_ktid < ipvs->est_kt_count) { kd = ipvs->est_kt_arr[ipvs->est_add_ktid]; if (kd) goto add_est; } ret = ip_vs_est_add_kthread(ipvs); if (ret < 0) goto out; kd = ipvs->est_kt_arr[ipvs->est_add_ktid]; add_est: ktid = kd->id; /* For small number of estimators prefer to use few ticks, * otherwise try to add into the last estimated row. * est_row and add_row point after the row we should use */ if (kd->est_count >= 2 * kd->tick_max || delay < IPVS_EST_NTICKS - 1) crow = READ_ONCE(kd->est_row); else crow = kd->add_row; crow += delay; if (crow >= IPVS_EST_NTICKS) crow -= IPVS_EST_NTICKS; /* Assume initial delay ? 
*/
	if (delay >= IPVS_EST_NTICKS - 1) {
		/* Preserve initial delay or decrease it if no space in tick */
		row = crow;
		if (crow < IPVS_EST_NTICKS - 1) {
			crow++;
			row = find_last_bit(kd->avail, crow);
		}
		if (row >= crow)
			row = find_last_bit(kd->avail, IPVS_EST_NTICKS);
	} else {
		/* Preserve delay or increase it if no space in tick */
		row = IPVS_EST_NTICKS;
		if (crow > 0)
			row = find_next_bit(kd->avail, IPVS_EST_NTICKS, crow);
		if (row >= IPVS_EST_NTICKS)
			row = find_first_bit(kd->avail, IPVS_EST_NTICKS);
	}
	/* NOTE(review): the code below indexes with row and cid without
	 * range checks, so it presumably relies on kd having an available
	 * row with a non-full chain - confirm against the callers.
	 */

	/* Allocate the tick data of this row on first use */
	td = rcu_dereference_protected(kd->ticks[row], 1);
	if (!td) {
		td = kzalloc(sizeof(*td), GFP_KERNEL);
		if (!td) {
			ret = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(kd->ticks[row], td);
	}

	/* Use the first chain of the tick that is not marked full */
	cid = find_first_zero_bit(td->full, IPVS_EST_TICK_CHAINS);

	kd->est_count++;
	kd->tick_len[row]++;
	if (!td->chain_len[cid])
		__set_bit(cid, td->present);
	td->chain_len[cid]++;
	/* Remember where the estimator is linked */
	est->ktid = ktid;
	est->ktrow = row;
	est->ktcid = cid;
	hlist_add_head_rcu(&est->list, &td->chains[cid]);

	/* Mark the chain full at chain_max and the row unavailable once
	 * the tick holds tick_max estimators
	 */
	if (td->chain_len[cid] >= kd->chain_max) {
		__set_bit(cid, td->full);
		if (kd->tick_len[row] >= kd->tick_max)
			__clear_bit(row, kd->avail);
	}

	/* Update est_add_ktid to point to first available/empty kt slot */
	if (kd->est_count == kd->est_max_count)
		ip_vs_est_update_ktid(ipvs);

	ret = 0;

out:
	return ret;
}

/* Start estimation for stats
 *
 * Marks the estimator as not attached to a kthread (ktid -1) and stores
 * the initial delay in ktrow.
 */
int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
{
	struct ip_vs_estimator *est = &stats->est;
	int ret;

	/* Determine the thread limit lazily, once enabled */
	if (!ipvs->est_max_threads && ipvs->enable)
		ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);

	est->ktid = -1;
	est->ktrow = IPVS_EST_NTICKS - 1;	/* Initial delay */

	/* We prefer this code to be short, kthread 0 will requeue the
	 * estimator to available chain. If tasks are disabled, we
	 * will not allocate much memory, just for kt 0.
*/
	ret = 0;
	/* Create the kthread 0 context if it does not exist yet */
	if (!ipvs->est_kt_count || !ipvs->est_kt_arr[0])
		ret = ip_vs_est_add_kthread(ipvs);
	/* On failure leave the node unhashed, ip_vs_stop_estimator()
	 * checks for that
	 */
	if (ret >= 0)
		hlist_add_head(&est->list, &ipvs->est_temp_list);
	else
		INIT_HLIST_NODE(&est->list);
	return ret;
}

/* Stop the task of @kd if one is set, then free its calc_stats and the
 * context itself. A NULL @kd is ignored.
 */
static void ip_vs_est_kthread_destroy(struct ip_vs_est_kt_data *kd)
{
	if (kd) {
		if (kd->task) {
			pr_info("stop unused estimator thread %d...\n",
				kd->id);
			kthread_stop(kd->task);
		}
		ip_vs_stats_free(kd->calc_stats);
		kfree(kd);
	}
}

/* Unlink estimator from chain
 *
 * Handles estimators still waiting in est_temp_list (ktid < 0) as well
 * as estimators linked into a kthread chain. A kthread context other
 * than 0 is destroyed when its last estimator is removed, so the
 * caller must not use that context after this function returns.
 */
void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
{
	struct ip_vs_estimator *est = &stats->est;
	struct ip_vs_est_tick_data *td;
	struct ip_vs_est_kt_data *kd;
	int ktid = est->ktid;
	int row = est->ktrow;
	int cid = est->ktcid;

	/* Failed to add to chain ? */
	if (hlist_unhashed(&est->list))
		return;

	/* On return, estimator can be freed, dequeue it now */

	/* In est_temp_list ? */
	if (ktid < 0) {
		hlist_del(&est->list);
		goto end_kt0;
	}

	hlist_del_rcu(&est->list);
	kd = ipvs->est_kt_arr[ktid];
	td = rcu_dereference_protected(kd->ticks[row], 1);
	/* The chain and the row have room again */
	__clear_bit(cid, td->full);
	td->chain_len[cid]--;
	if (!td->chain_len[cid])
		__clear_bit(cid, td->present);
	kd->tick_len[row]--;
	__set_bit(row, kd->avail);
	/* Release the tick data when the row becomes empty */
	if (!kd->tick_len[row]) {
		RCU_INIT_POINTER(kd->ticks[row], NULL);
		kfree_rcu(td, rcu_head);
	}
	kd->est_count--;
	if (kd->est_count) {
		/* This kt slot can become available just now, prefer it */
		if (ktid < ipvs->est_add_ktid)
			ipvs->est_add_ktid = ktid;
		return;
	}

	/* Context is empty now; contexts other than 0 are destroyed here */
	if (ktid > 0) {
		mutex_lock(&ipvs->est_mutex);
		ip_vs_est_kthread_destroy(kd);
		ipvs->est_kt_arr[ktid] = NULL;
		/* Shrink est_kt_count over trailing empty slots, keep slot 0 */
		if (ktid == ipvs->est_kt_count - 1) {
			ipvs->est_kt_count--;
			while (ipvs->est_kt_count > 1 &&
			       !ipvs->est_kt_arr[ipvs->est_kt_count - 1])
				ipvs->est_kt_count--;
		}
		mutex_unlock(&ipvs->est_mutex);

		/* This slot is now empty, prefer another available kt slot */
		if (ktid == ipvs->est_add_ktid)
			ip_vs_est_update_ktid(ipvs);
	}

end_kt0:
	/* kt 0 is freed after all other kthreads and chains are empty */
	if (ipvs->est_kt_count == 1 &&
hlist_empty(&ipvs->est_temp_list)) {
		kd = ipvs->est_kt_arr[0];
		if (!kd || !kd->est_count) {
			mutex_lock(&ipvs->est_mutex);
			if (kd) {
				ip_vs_est_kthread_destroy(kd);
				ipvs->est_kt_arr[0] = NULL;
			}
			ipvs->est_kt_count--;
			mutex_unlock(&ipvs->est_mutex);
			ipvs->est_add_ktid = 0;
		}
	}
}

/* Register all ests from est_temp_list to kthreads
 *
 * Moves up to 16 estimators per hold of __ip_vs_mutex, then drops the
 * mutex and reschedules. Stops when the list is empty, when the kthread
 * should stop or when an estimator can not be enqueued.
 */
static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs)
{
	struct ip_vs_estimator *est;

	while (1) {
		int max = 16;

		mutex_lock(&__ip_vs_mutex);

		while (max-- > 0) {
			est = hlist_entry_safe(ipvs->est_temp_list.first,
					       struct ip_vs_estimator, list);
			if (est) {
				if (kthread_should_stop())
					goto unlock;
				hlist_del_init(&est->list);
				if (ip_vs_enqueue_estimator(ipvs, est) >= 0)
					continue;
				/* Enqueue failed, put it back in the list */
				est->ktid = -1;
				hlist_add_head(&est->list,
					       &ipvs->est_temp_list);
				/* Abort, some entries will not be estimated
				 * until next attempt
				 */
			}
			/* List is empty or enqueue failed */
			goto unlock;
		}
		mutex_unlock(&__ip_vs_mutex);
		cond_resched();
	}

unlock:
	mutex_unlock(&__ip_vs_mutex);
}

/* Calculate limits for all kthreads
 *
 * Times ip_vs_chain_estimation() on a private chain that holds only
 * the calc_stats of kthread 0 and stores the resulting chain limit in
 * @chain_max.
 */
static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max)
{
	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
	struct ip_vs_est_kt_data *kd;
	struct hlist_head chain;
	struct ip_vs_stats *s;
	int cache_factor = 4;
	int i, loops, ntest;
	s32 min_est = 0;
	ktime_t t1, t2;
	int max = 8;
	int ret = 1;
	s64 diff;
	u64 val;

	INIT_HLIST_HEAD(&chain);
	mutex_lock(&__ip_vs_mutex);
	kd = ipvs->est_kt_arr[0];
	mutex_unlock(&__ip_vs_mutex);
	s = kd ?
kd->calc_stats : NULL;
	/* Without calc_stats keep the default max and report success */
	if (!s)
		goto out;
	hlist_add_head(&s->est.list, &chain);

	loops = 1;
	/* Get best result from many tests */
	for (ntest = 0; ntest < 12; ntest++) {
		/* Pause before every 4th test */
		if (!(ntest & 3)) {
			/* Wait for cpufreq frequency transition */
			wait_event_idle_timeout(wq, kthread_should_stop(),
						HZ / 50);
			if (!ipvs->enable || kthread_should_stop())
				goto stop;
		}

		local_bh_disable();
		rcu_read_lock();

		/* Put stats in cache */
		ip_vs_chain_estimation(&chain);

		t1 = ktime_get();
		for (i = loops * cache_factor; i > 0; i--)
			ip_vs_chain_estimation(&chain);
		t2 = ktime_get();

		rcu_read_unlock();
		local_bh_enable();

		if (!ipvs->enable || kthread_should_stop())
			goto stop;
		cond_resched();

		diff = ktime_to_ns(ktime_sub(t2, t1));
		if (diff <= 1 * NSEC_PER_USEC) {
			/* Do more loops on low time resolution */
			loops *= 2;
			continue;
		}
		/* Ignore samples of one second or more */
		if (diff >= NSEC_PER_SEC)
			continue;
		/* diff covers loops * cache_factor calls, so val is the
		 * time of cache_factor calls
		 */
		val = diff;
		do_div(val, loops);
		/* NOTE(review): if diff is smaller than loops, val and so
		 * min_est become 0 and do_div(val, min_est) below divides
		 * by zero - confirm this can not happen in practice.
		 */
		if (!min_est || val < min_est) {
			min_est = val;
			/* goal: 95usec per chain */
			val = 95 * NSEC_PER_USEC;
			if (val >= min_est) {
				do_div(val, min_est);
				max = (int)val;
			} else {
				max = 1;
			}
		}
	}

out:
	if (s)
		hlist_del_init(&s->est.list);
	*chain_max = max;
	return ret;

stop:
	/* Interrupted: still store max but return 0 to the caller */
	ret = 0;
	goto out;
}

/* Calculate the parameters and apply them in context of kt #0
 * ECP: est_calc_phase
 * ECM: est_chain_max
 * ECP	ECM	Insert Chain	enable	Description
 * ---------------------------------------------------------------------------
 * 0	0	est_temp_list	0	create kt #0 context
 * 0	0	est_temp_list	0->1	service added, start kthread #0 task
 * 0->1	0	est_temp_list	1	kt task #0 started, enters calc phase
 * 1	0	est_temp_list	1	kt #0: determine est_chain_max,
 *					stop tasks, move ests to est_temp_list
 *					and free kd for kthreads 1..last
 * 1->0	0->N	kt chains	1	ests can go to kthreads
 * 0	N	kt chains	1	drain est_temp_list, create new kthread
 *					contexts, start tasks, estimate
 */
static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs)
{
	/* Snapshot, compared at the end to detect a new calc request */
	int genid = atomic_read(&ipvs->est_genid);
	struct ip_vs_est_tick_data *td;
	struct ip_vs_est_kt_data *kd;
	struct
ip_vs_estimator *est;
	struct ip_vs_stats *stats;
	int id, row, cid, delay;
	bool last, last_td;
	int chain_max;
	int step;

	/* A zero return means the calculation was interrupted */
	if (!ip_vs_est_calc_limits(ipvs, &chain_max))
		return;

	mutex_lock(&__ip_vs_mutex);

	/* Stop all other tasks, so that we can immediately move the
	 * estimators to est_temp_list without RCU grace period
	 */
	mutex_lock(&ipvs->est_mutex);
	for (id = 1; id < ipvs->est_kt_count; id++) {
		/* netns clean up started, abort */
		if (!ipvs->enable)
			goto unlock2;
		kd = ipvs->est_kt_arr[id];
		if (!kd)
			continue;
		ip_vs_est_kthread_stop(kd);
	}
	mutex_unlock(&ipvs->est_mutex);

	/* Move all estimators to est_temp_list but carefully,
	 * all estimators and kthread data can be released while
	 * we reschedule. Even for kthread 0.
	 */
	step = 0;

	/* Order entries in est_temp_list in ascending delay, so now
	 * walk delay(desc), id(desc), cid(asc)
	 */
	delay = IPVS_EST_NTICKS;

next_delay:
	delay--;
	if (delay < 0)
		goto end_dequeue;

last_kt:
	/* Destroy contexts backwards */
	id = ipvs->est_kt_count;

next_kt:
	if (!ipvs->enable || kthread_should_stop())
		goto unlock;
	id--;
	if (id < 0)
		goto next_delay;
	kd = ipvs->est_kt_arr[id];
	if (!kd)
		goto next_kt;
	/* kt 0 can exist with empty chains */
	if (!id && kd->est_count <= 1)
		goto next_delay;

	/* Row that is 'delay' ticks after est_row, with wrap around */
	row = kd->est_row + delay;
	if (row >= IPVS_EST_NTICKS)
		row -= IPVS_EST_NTICKS;
	td = rcu_dereference_protected(kd->ticks[row], 1);
	if (!td)
		goto next_kt;

	cid = 0;

walk_chain:
	if (kthread_should_stop())
		goto unlock;
	step++;
	/* Every 64th step drop the mutex and revalidate kd and td */
	if (!(step & 63)) {
		/* Give chance estimators to be added (to est_temp_list)
		 * and deleted (releasing kthread contexts)
		 */
		mutex_unlock(&__ip_vs_mutex);
		cond_resched();
		mutex_lock(&__ip_vs_mutex);

		/* Current kt released ? */
		if (id >= ipvs->est_kt_count)
			goto last_kt;
		if (kd != ipvs->est_kt_arr[id])
			goto next_kt;
		/* Current td released ?
*/ if (td != rcu_dereference_protected(kd->ticks[row], 1)) goto next_kt; /* No fatal changes on the current kd and td */ } est = hlist_entry_safe(td->chains[cid].first, struct ip_vs_estimator, list); if (!est) { cid++; if (cid >= IPVS_EST_TICK_CHAINS) goto next_kt; goto walk_chain; } /* We can cheat and increase est_count to protect kt 0 context * from release but we prefer to keep the last estimator */ last = kd->est_count <= 1; /* Do not free kt #0 data */ if (!id && last) goto next_delay; last_td = kd->tick_len[row] <= 1; stats = container_of(est, struct ip_vs_stats, est); ip_vs_stop_estimator(ipvs, stats); /* Tasks are stopped, move without RCU grace period */ est->ktid = -1; est->ktrow = row - kd->est_row; if (est->ktrow < 0) est->ktrow += IPVS_EST_NTICKS; hlist_add_head(&est->list, &ipvs->est_temp_list); /* kd freed ? */ if (last) goto next_kt; /* td freed ? */ if (last_td) goto next_kt; goto walk_chain; end_dequeue: /* All estimators removed while calculating ? */ if (!ipvs->est_kt_count) goto unlock; kd = ipvs->est_kt_arr[0]; if (!kd) goto unlock; kd->add_row = kd->est_row; ipvs->est_chain_max = chain_max; ip_vs_est_set_params(ipvs, kd); pr_info("using max %d ests per chain, %d per kthread\n", kd->chain_max, kd->est_max_count); /* Try to keep tot_stats in kt0, enqueue it early */ if (ipvs->tot_stats && !hlist_unhashed(&ipvs->tot_stats->s.est.list) && ipvs->tot_stats->s.est.ktid == -1) { hlist_del(&ipvs->tot_stats->s.est.list); hlist_add_head(&ipvs->tot_stats->s.est.list, &ipvs->est_temp_list); } mutex_lock(&ipvs->est_mutex); /* We completed the calc phase, new calc phase not requested */ if (genid == atomic_read(&ipvs->est_genid)) ipvs->est_calc_phase = 0; unlock2: mutex_unlock(&ipvs->est_mutex); unlock: mutex_unlock(&__ip_vs_mutex); } void ip_vs_zero_estimator(struct ip_vs_stats *stats) { struct ip_vs_estimator *est = &stats->est; struct ip_vs_kstats *k = &stats->kstats; /* reset counters, caller must hold the stats->lock lock */ est->last_inbytes = 
k->inbytes; est->last_outbytes = k->outbytes; est->last_conns = k->conns; est->last_inpkts = k->inpkts; est->last_outpkts = k->outpkts; est->cps = 0; est->inpps = 0; est->outpps = 0; est->inbps = 0; est->outbps = 0; } /* Get decoded rates */ void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats) { struct ip_vs_estimator *e = &stats->est; dst->cps = (e->cps + 0x1FF) >> 10; dst->inpps = (e->inpps + 0x1FF) >> 10; dst->outpps = (e->outpps + 0x1FF) >> 10; dst->inbps = (e->inbps + 0xF) >> 5; dst->outbps = (e->outbps + 0xF) >> 5; } int __net_init ip_vs_estimator_net_init(struct netns_ipvs *ipvs) { INIT_HLIST_HEAD(&ipvs->est_temp_list); ipvs->est_kt_arr = NULL; ipvs->est_max_threads = 0; ipvs->est_calc_phase = 0; ipvs->est_chain_max = 0; ipvs->est_kt_count = 0; ipvs->est_add_ktid = 0; atomic_set(&ipvs->est_genid, 0); atomic_set(&ipvs->est_genid_done, 0); __mutex_init(&ipvs->est_mutex, "ipvs->est_mutex", &__ipvs_est_key); return 0; } void __net_exit ip_vs_estimator_net_cleanup(struct netns_ipvs *ipvs) { int i; for (i = 0; i < ipvs->est_kt_count; i++) ip_vs_est_kthread_destroy(ipvs->est_kt_arr[i]); kfree(ipvs->est_kt_arr); mutex_destroy(&ipvs->est_mutex); }
87 87 41 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 
525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 
1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 // SPDX-License-Identifier: GPL-2.0-or-later /* * Sysfs attributes of bridge * Linux ethernet bridge * * Authors: * Stephen Hemminger <shemminger@osdl.org> */ #include <linux/capability.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/if_bridge.h> #include <linux/rtnetlink.h> #include <linux/spinlock.h> #include <linux/times.h> #include <linux/sched/signal.h> #include "br_private.h" /* IMPORTANT: new bridge options must be added with netlink support only * please do not add new sysfs entries */ #define to_bridge(cd) ((struct net_bridge *)netdev_priv(to_net_dev(cd))) /* * Common code for storing bridge parameters. */ static ssize_t store_bridge_parm(struct device *d, const char *buf, size_t len, int (*set)(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack)) { struct net_bridge *br = to_bridge(d); struct netlink_ext_ack extack = {0}; unsigned long val; int err; if (!ns_capable(dev_net(br->dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; err = kstrtoul(buf, 0, &val); if (err != 0) return err; if (!rtnl_trylock()) return restart_syscall(); err = (*set)(br, val, &extack); if (!err) netdev_state_change(br->dev); if (extack._msg) { if (err) br_err(br, "%s\n", extack._msg); else br_warn(br, "%s\n", extack._msg); } rtnl_unlock(); return err ? 
err : len; } static ssize_t forward_delay_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->forward_delay)); } static int set_forward_delay(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_set_forward_delay(br, val); } static ssize_t forward_delay_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_forward_delay); } static DEVICE_ATTR_RW(forward_delay); static ssize_t hello_time_show(struct device *d, struct device_attribute *attr, char *buf) { return sprintf(buf, "%lu\n", jiffies_to_clock_t(to_bridge(d)->hello_time)); } static int set_hello_time(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_set_hello_time(br, val); } static ssize_t hello_time_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_hello_time); } static DEVICE_ATTR_RW(hello_time); static ssize_t max_age_show(struct device *d, struct device_attribute *attr, char *buf) { return sprintf(buf, "%lu\n", jiffies_to_clock_t(to_bridge(d)->max_age)); } static int set_max_age(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_set_max_age(br, val); } static ssize_t max_age_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_max_age); } static DEVICE_ATTR_RW(max_age); static ssize_t ageing_time_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->ageing_time)); } static int set_ageing_time(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_set_ageing_time(br, val); } static ssize_t ageing_time_store(struct device *d, struct device_attribute 
*attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_ageing_time); } static DEVICE_ATTR_RW(ageing_time); static ssize_t stp_state_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br->stp_enabled); } static int set_stp_state(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_stp_set_enabled(br, val, extack); } static ssize_t stp_state_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_stp_state); } static DEVICE_ATTR_RW(stp_state); static ssize_t group_fwd_mask_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%#x\n", br->group_fwd_mask); } static int set_group_fwd_mask(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { if (val & BR_GROUPFWD_RESTRICTED) return -EINVAL; br->group_fwd_mask = val; return 0; } static ssize_t group_fwd_mask_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_group_fwd_mask); } static DEVICE_ATTR_RW(group_fwd_mask); static ssize_t priority_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1]); } static int set_priority(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br_stp_set_bridge_priority(br, (u16) val); return 0; } static ssize_t priority_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_priority); } static DEVICE_ATTR_RW(priority); static ssize_t root_id_show(struct device *d, struct device_attribute *attr, char *buf) { return br_show_bridge_id(buf, &to_bridge(d)->designated_root); } static 
DEVICE_ATTR_RO(root_id); static ssize_t bridge_id_show(struct device *d, struct device_attribute *attr, char *buf) { return br_show_bridge_id(buf, &to_bridge(d)->bridge_id); } static DEVICE_ATTR_RO(bridge_id); static ssize_t root_port_show(struct device *d, struct device_attribute *attr, char *buf) { return sprintf(buf, "%d\n", to_bridge(d)->root_port); } static DEVICE_ATTR_RO(root_port); static ssize_t root_path_cost_show(struct device *d, struct device_attribute *attr, char *buf) { return sprintf(buf, "%d\n", to_bridge(d)->root_path_cost); } static DEVICE_ATTR_RO(root_path_cost); static ssize_t topology_change_show(struct device *d, struct device_attribute *attr, char *buf) { return sprintf(buf, "%d\n", to_bridge(d)->topology_change); } static DEVICE_ATTR_RO(topology_change); static ssize_t topology_change_detected_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br->topology_change_detected); } static DEVICE_ATTR_RO(topology_change_detected); static ssize_t hello_timer_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%ld\n", br_timer_value(&br->hello_timer)); } static DEVICE_ATTR_RO(hello_timer); static ssize_t tcn_timer_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%ld\n", br_timer_value(&br->tcn_timer)); } static DEVICE_ATTR_RO(tcn_timer); static ssize_t topology_change_timer_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%ld\n", br_timer_value(&br->topology_change_timer)); } static DEVICE_ATTR_RO(topology_change_timer); static ssize_t gc_timer_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%ld\n", br_timer_value(&br->gc_work.timer)); } static DEVICE_ATTR_RO(gc_timer); 
static ssize_t group_addr_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%pM\n", br->group_addr); } static ssize_t group_addr_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { struct net_bridge *br = to_bridge(d); u8 new_addr[6]; if (!ns_capable(dev_net(br->dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; if (!mac_pton(buf, new_addr)) return -EINVAL; if (!is_link_local_ether_addr(new_addr)) return -EINVAL; if (new_addr[5] == 1 || /* 802.3x Pause address */ new_addr[5] == 2 || /* 802.3ad Slow protocols */ new_addr[5] == 3) /* 802.1X PAE address */ return -EINVAL; if (!rtnl_trylock()) return restart_syscall(); spin_lock_bh(&br->lock); ether_addr_copy(br->group_addr, new_addr); spin_unlock_bh(&br->lock); br_opt_toggle(br, BROPT_GROUP_ADDR_SET, true); br_recalculate_fwd_mask(br); netdev_state_change(br->dev); rtnl_unlock(); return len; } static DEVICE_ATTR_RW(group_addr); static int set_flush(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { struct net_bridge_fdb_flush_desc desc = { .flags_mask = BIT(BR_FDB_STATIC) }; br_fdb_flush(br, &desc); return 0; } static ssize_t flush_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_flush); } static DEVICE_ATTR_WO(flush); static ssize_t no_linklocal_learn_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br_boolopt_get(br, BR_BOOLOPT_NO_LL_LEARN)); } static int set_no_linklocal_learn(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_boolopt_toggle(br, BR_BOOLOPT_NO_LL_LEARN, !!val, extack); } static ssize_t no_linklocal_learn_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_no_linklocal_learn); } static 
DEVICE_ATTR_RW(no_linklocal_learn); #ifdef CONFIG_BRIDGE_IGMP_SNOOPING static ssize_t multicast_router_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br->multicast_ctx.multicast_router); } static int set_multicast_router(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_multicast_set_router(&br->multicast_ctx, val); } static ssize_t multicast_router_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_multicast_router); } static DEVICE_ATTR_RW(multicast_router); static ssize_t multicast_snooping_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br_opt_get(br, BROPT_MULTICAST_ENABLED)); } static ssize_t multicast_snooping_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, br_multicast_toggle); } static DEVICE_ATTR_RW(multicast_snooping); static ssize_t multicast_query_use_ifaddr_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br_opt_get(br, BROPT_MULTICAST_QUERY_USE_IFADDR)); } static int set_query_use_ifaddr(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br_opt_toggle(br, BROPT_MULTICAST_QUERY_USE_IFADDR, !!val); return 0; } static ssize_t multicast_query_use_ifaddr_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_query_use_ifaddr); } static DEVICE_ATTR_RW(multicast_query_use_ifaddr); static ssize_t multicast_querier_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br->multicast_ctx.multicast_querier); } static int set_multicast_querier(struct 
net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_multicast_set_querier(&br->multicast_ctx, val); } static ssize_t multicast_querier_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_multicast_querier); } static DEVICE_ATTR_RW(multicast_querier); static ssize_t hash_elasticity_show(struct device *d, struct device_attribute *attr, char *buf) { return sprintf(buf, "%u\n", RHT_ELASTICITY); } static int set_elasticity(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { /* 16 is RHT_ELASTICITY */ NL_SET_ERR_MSG_MOD(extack, "the hash_elasticity option has been deprecated and is always 16"); return 0; } static ssize_t hash_elasticity_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_elasticity); } static DEVICE_ATTR_RW(hash_elasticity); static ssize_t hash_max_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br->hash_max); } static int set_hash_max(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br->hash_max = val; return 0; } static ssize_t hash_max_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_hash_max); } static DEVICE_ATTR_RW(hash_max); static ssize_t multicast_igmp_version_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br->multicast_ctx.multicast_igmp_version); } static int set_multicast_igmp_version(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_multicast_set_igmp_version(&br->multicast_ctx, val); } static ssize_t multicast_igmp_version_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, 
buf, len, set_multicast_igmp_version); } static DEVICE_ATTR_RW(multicast_igmp_version); static ssize_t multicast_last_member_count_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br->multicast_ctx.multicast_last_member_count); } static int set_last_member_count(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br->multicast_ctx.multicast_last_member_count = val; return 0; } static ssize_t multicast_last_member_count_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_last_member_count); } static DEVICE_ATTR_RW(multicast_last_member_count); static ssize_t multicast_startup_query_count_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br->multicast_ctx.multicast_startup_query_count); } static int set_startup_query_count(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br->multicast_ctx.multicast_startup_query_count = val; return 0; } static ssize_t multicast_startup_query_count_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_startup_query_count); } static DEVICE_ATTR_RW(multicast_startup_query_count); static ssize_t multicast_last_member_interval_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->multicast_ctx.multicast_last_member_interval)); } static int set_last_member_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br->multicast_ctx.multicast_last_member_interval = clock_t_to_jiffies(val); return 0; } static ssize_t multicast_last_member_interval_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return 
store_bridge_parm(d, buf, len, set_last_member_interval); } static DEVICE_ATTR_RW(multicast_last_member_interval); static ssize_t multicast_membership_interval_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->multicast_ctx.multicast_membership_interval)); } static int set_membership_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br->multicast_ctx.multicast_membership_interval = clock_t_to_jiffies(val); return 0; } static ssize_t multicast_membership_interval_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_membership_interval); } static DEVICE_ATTR_RW(multicast_membership_interval); static ssize_t multicast_querier_interval_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->multicast_ctx.multicast_querier_interval)); } static int set_querier_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br->multicast_ctx.multicast_querier_interval = clock_t_to_jiffies(val); return 0; } static ssize_t multicast_querier_interval_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_querier_interval); } static DEVICE_ATTR_RW(multicast_querier_interval); static ssize_t multicast_query_interval_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->multicast_ctx.multicast_query_interval)); } static int set_query_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br_multicast_set_query_intvl(&br->multicast_ctx, val); return 0; } static ssize_t multicast_query_interval_store(struct device *d, struct device_attribute 
*attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_query_interval); } static DEVICE_ATTR_RW(multicast_query_interval); static ssize_t multicast_query_response_interval_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf( buf, "%lu\n", jiffies_to_clock_t(br->multicast_ctx.multicast_query_response_interval)); } static int set_query_response_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br->multicast_ctx.multicast_query_response_interval = clock_t_to_jiffies(val); return 0; } static ssize_t multicast_query_response_interval_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_query_response_interval); } static DEVICE_ATTR_RW(multicast_query_response_interval); static ssize_t multicast_startup_query_interval_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf( buf, "%lu\n", jiffies_to_clock_t(br->multicast_ctx.multicast_startup_query_interval)); } static int set_startup_query_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br_multicast_set_startup_query_intvl(&br->multicast_ctx, val); return 0; } static ssize_t multicast_startup_query_interval_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_startup_query_interval); } static DEVICE_ATTR_RW(multicast_startup_query_interval); static ssize_t multicast_stats_enabled_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED)); } static int set_stats_enabled(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br_opt_toggle(br, BROPT_MULTICAST_STATS_ENABLED, !!val); return 0; } static ssize_t 
multicast_stats_enabled_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_stats_enabled); } static DEVICE_ATTR_RW(multicast_stats_enabled); #if IS_ENABLED(CONFIG_IPV6) static ssize_t multicast_mld_version_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br->multicast_ctx.multicast_mld_version); } static int set_multicast_mld_version(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_multicast_set_mld_version(&br->multicast_ctx, val); } static ssize_t multicast_mld_version_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_multicast_mld_version); } static DEVICE_ATTR_RW(multicast_mld_version); #endif #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static ssize_t nf_call_iptables_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_IPTABLES)); } static int set_nf_call_iptables(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br_opt_toggle(br, BROPT_NF_CALL_IPTABLES, !!val); return 0; } static ssize_t nf_call_iptables_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_nf_call_iptables); } static DEVICE_ATTR_RW(nf_call_iptables); static ssize_t nf_call_ip6tables_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_IP6TABLES)); } static int set_nf_call_ip6tables(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br_opt_toggle(br, BROPT_NF_CALL_IP6TABLES, !!val); return 0; } static ssize_t nf_call_ip6tables_store( struct device *d, struct device_attribute 
*attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_nf_call_ip6tables); } static DEVICE_ATTR_RW(nf_call_ip6tables); static ssize_t nf_call_arptables_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_ARPTABLES)); } static int set_nf_call_arptables(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br_opt_toggle(br, BROPT_NF_CALL_ARPTABLES, !!val); return 0; } static ssize_t nf_call_arptables_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_nf_call_arptables); } static DEVICE_ATTR_RW(nf_call_arptables); #endif #ifdef CONFIG_BRIDGE_VLAN_FILTERING static ssize_t vlan_filtering_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br_opt_get(br, BROPT_VLAN_ENABLED)); } static ssize_t vlan_filtering_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, br_vlan_filter_toggle); } static DEVICE_ATTR_RW(vlan_filtering); static ssize_t vlan_protocol_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%#06x\n", ntohs(br->vlan_proto)); } static ssize_t vlan_protocol_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, br_vlan_set_proto); } static DEVICE_ATTR_RW(vlan_protocol); static ssize_t default_pvid_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br->default_pvid); } static ssize_t default_pvid_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, br_vlan_set_default_pvid); } static 
DEVICE_ATTR_RW(default_pvid); static ssize_t vlan_stats_enabled_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br_opt_get(br, BROPT_VLAN_STATS_ENABLED)); } static int set_vlan_stats_enabled(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_vlan_set_stats(br, val); } static ssize_t vlan_stats_enabled_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_vlan_stats_enabled); } static DEVICE_ATTR_RW(vlan_stats_enabled); static ssize_t vlan_stats_per_port_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br_opt_get(br, BROPT_VLAN_STATS_PER_PORT)); } static int set_vlan_stats_per_port(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_vlan_set_stats_per_port(br, val); } static ssize_t vlan_stats_per_port_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_vlan_stats_per_port); } static DEVICE_ATTR_RW(vlan_stats_per_port); #endif static struct attribute *bridge_attrs[] = { &dev_attr_forward_delay.attr, &dev_attr_hello_time.attr, &dev_attr_max_age.attr, &dev_attr_ageing_time.attr, &dev_attr_stp_state.attr, &dev_attr_group_fwd_mask.attr, &dev_attr_priority.attr, &dev_attr_bridge_id.attr, &dev_attr_root_id.attr, &dev_attr_root_path_cost.attr, &dev_attr_root_port.attr, &dev_attr_topology_change.attr, &dev_attr_topology_change_detected.attr, &dev_attr_hello_timer.attr, &dev_attr_tcn_timer.attr, &dev_attr_topology_change_timer.attr, &dev_attr_gc_timer.attr, &dev_attr_group_addr.attr, &dev_attr_flush.attr, &dev_attr_no_linklocal_learn.attr, #ifdef CONFIG_BRIDGE_IGMP_SNOOPING &dev_attr_multicast_router.attr, &dev_attr_multicast_snooping.attr, &dev_attr_multicast_querier.attr, 
&dev_attr_multicast_query_use_ifaddr.attr, &dev_attr_hash_elasticity.attr, &dev_attr_hash_max.attr, &dev_attr_multicast_last_member_count.attr, &dev_attr_multicast_startup_query_count.attr, &dev_attr_multicast_last_member_interval.attr, &dev_attr_multicast_membership_interval.attr, &dev_attr_multicast_querier_interval.attr, &dev_attr_multicast_query_interval.attr, &dev_attr_multicast_query_response_interval.attr, &dev_attr_multicast_startup_query_interval.attr, &dev_attr_multicast_stats_enabled.attr, &dev_attr_multicast_igmp_version.attr, #if IS_ENABLED(CONFIG_IPV6) &dev_attr_multicast_mld_version.attr, #endif #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) &dev_attr_nf_call_iptables.attr, &dev_attr_nf_call_ip6tables.attr, &dev_attr_nf_call_arptables.attr, #endif #ifdef CONFIG_BRIDGE_VLAN_FILTERING &dev_attr_vlan_filtering.attr, &dev_attr_vlan_protocol.attr, &dev_attr_default_pvid.attr, &dev_attr_vlan_stats_enabled.attr, &dev_attr_vlan_stats_per_port.attr, #endif NULL }; static const struct attribute_group bridge_group = { .name = SYSFS_BRIDGE_ATTR, .attrs = bridge_attrs, }; /* * Export the forwarding information table as a binary file * The records are struct __fdb_entry. * * Returns the number of bytes read. */ static ssize_t brforward_read(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { struct device *dev = kobj_to_dev(kobj); struct net_bridge *br = to_bridge(dev); int n; /* must read whole records */ if (off % sizeof(struct __fdb_entry) != 0) return -EINVAL; n = br_fdb_fillbuf(br, buf, count / sizeof(struct __fdb_entry), off / sizeof(struct __fdb_entry)); if (n > 0) n *= sizeof(struct __fdb_entry); return n; } static struct bin_attribute bridge_forward = { .attr = { .name = SYSFS_BRIDGE_FDB, .mode = 0444, }, .read = brforward_read, }; /* * Add entries in sysfs onto the existing network class device * for the bridge. * Adds a attribute group "bridge" containing tuning parameters. 
* Binary attribute containing the forward table * Sub directory to hold links to interfaces. * * Note: the ifobj exists only to be a subdirectory * to hold links. The ifobj exists in same data structure * as it's parent the bridge so reference counting works. */ int br_sysfs_addbr(struct net_device *dev) { struct kobject *brobj = &dev->dev.kobj; struct net_bridge *br = netdev_priv(dev); int err; err = sysfs_create_group(brobj, &bridge_group); if (err) { pr_info("%s: can't create group %s/%s\n", __func__, dev->name, bridge_group.name); goto out1; } err = sysfs_create_bin_file(brobj, &bridge_forward); if (err) { pr_info("%s: can't create attribute file %s/%s\n", __func__, dev->name, bridge_forward.attr.name); goto out2; } br->ifobj = kobject_create_and_add(SYSFS_BRIDGE_PORT_SUBDIR, brobj); if (!br->ifobj) { pr_info("%s: can't add kobject (directory) %s/%s\n", __func__, dev->name, SYSFS_BRIDGE_PORT_SUBDIR); err = -ENOMEM; goto out3; } return 0; out3: sysfs_remove_bin_file(&dev->dev.kobj, &bridge_forward); out2: sysfs_remove_group(&dev->dev.kobj, &bridge_group); out1: return err; } void br_sysfs_delbr(struct net_device *dev) { struct kobject *kobj = &dev->dev.kobj; struct net_bridge *br = netdev_priv(dev); kobject_put(br->ifobj); sysfs_remove_bin_file(kobj, &bridge_forward); sysfs_remove_group(kobj, &bridge_group); }
5 143 143 148 308 309 179 303 6 186 8 8 66 68 179 187 185 2 104 104 2 34 97 180 269 263 123 142 15 53 59 115 104 108 91 103 113 107 4 1 109 13 5 96 94 94 115 115 119 116 5 92 22 111 4 96 94 79 93 90 66 53 79 27 57 40 95 2 108 34 10 23 33 113 1 117 111 115 118 116 115 119 5 64 42 106 1 104 6 2 13 22 21 55 1 48 3 99 76 1 33 110 109 108 196 194 127 84 7 209 212 96 2 108 68 24 45 71 210 2 8 1 200 7 219 1 8 198 194 198 2 1 1 1 56 2 53 50 1 171 70 2 92 2 157 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 
413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 
913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 
1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 // SPDX-License-Identifier: GPL-2.0 /* * This file contains the procedures for the handling of select and poll * * Created for Linux based loosely upon Mathius Lattner's minix * patches by Peter MacDonald. Heavily edited by Linus. * * 4 February 1994 * COFF/ELF binary emulation. If the process has the STICKY_TIMEOUTS * flag set in its personality we do *not* modify the given timeout * parameter to reflect time remaining. * * 24 January 2000 * Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation * of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian). */ #include <linux/compat.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/sched/rt.h> #include <linux/syscalls.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/personality.h> /* for STICKY_TIMEOUTS */ #include <linux/file.h> #include <linux/fdtable.h> #include <linux/fs.h> #include <linux/rcupdate.h> #include <linux/hrtimer.h> #include <linux/freezer.h> #include <net/busy_poll.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> /* * Estimate expected accuracy in ns from a timeval. * * After quite a bit of churning around, we've settled on * a simple thing of taking 0.1% of the timeout as the * slack, with a cap of 100 msec. * "nice" tasks get a 0.5% slack instead. 
* * Consider this comment an open invitation to come up with even * better solutions.. */ #define MAX_SLACK (100 * NSEC_PER_MSEC) static long __estimate_accuracy(struct timespec64 *tv) { long slack; int divfactor = 1000; if (tv->tv_sec < 0) return 0; if (task_nice(current) > 0) divfactor = divfactor / 5; if (tv->tv_sec > MAX_SLACK / (NSEC_PER_SEC/divfactor)) return MAX_SLACK; slack = tv->tv_nsec / divfactor; slack += tv->tv_sec * (NSEC_PER_SEC/divfactor); if (slack > MAX_SLACK) return MAX_SLACK; return slack; } u64 select_estimate_accuracy(struct timespec64 *tv) { u64 ret; struct timespec64 now; /* * Realtime tasks get a slack of 0 for obvious reasons. */ if (rt_task(current)) return 0; ktime_get_ts64(&now); now = timespec64_sub(*tv, now); ret = __estimate_accuracy(&now); if (ret < current->timer_slack_ns) return current->timer_slack_ns; return ret; } struct poll_table_page { struct poll_table_page * next; struct poll_table_entry * entry; struct poll_table_entry entries[]; }; #define POLL_TABLE_FULL(table) \ ((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table)) /* * Ok, Peter made a complicated, but straightforward multiple_wait() function. * I have rewritten this, taking some shortcuts: This code may not be easy to * follow, but it should be free of race-conditions, and it's practical. If you * understand what I'm doing here, then you understand how the linux * sleep/wakeup mechanism works. * * Two very simple procedures, poll_wait() and poll_freewait() make all the * work. poll_wait() is an inline-function defined in <linux/poll.h>, * as all select/poll functions have to call it to add an entry to the * poll table. 
*/
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
		       poll_table *p);

/*
 * Initialise @pwq for a new wait: install __pollwait() as the queueing
 * callback, record the current task as the one to wake, and reset the
 * triggered flag, the error code, the page list and the inline index.
 */
void poll_initwait(struct poll_wqueues *pwq)
{
	init_poll_funcptr(&pwq->pt, __pollwait);
	pwq->polling_task = current;
	pwq->triggered = 0;
	pwq->error = 0;
	pwq->table = NULL;
	pwq->inline_index = 0;
}
EXPORT_SYMBOL(poll_initwait);

/*
 * Release one poll table entry: take it off its wait queue and drop the
 * file reference it holds.
 */
static void free_poll_entry(struct poll_table_entry *entry)
{
	remove_wait_queue(entry->wait_address, &entry->wait);
	fput(entry->filp);
}

/*
 * Tear down everything queued through @pwq: first the inline entries,
 * then every page on the table list. Within a page, entries are released
 * from the most recently handed out one back to the first, after which
 * the page itself is freed.
 */
void poll_freewait(struct poll_wqueues *pwq)
{
	struct poll_table_page * p = pwq->table;
	int i;
	for (i = 0; i < pwq->inline_index; i++)
		free_poll_entry(pwq->inline_entries + i);
	while (p) {
		struct poll_table_entry * entry;
		struct poll_table_page *old;

		entry = p->entry;
		do {
			entry--;
			free_poll_entry(entry);
		} while (entry > p->entries);
		old = p;
		p = p->next;
		free_page((unsigned long) old);
	}
}
EXPORT_SYMBOL(poll_freewait);

/*
 * Hand out the next free poll table entry of @p.
 *
 * Inline entries are used until N_INLINE_POLL_ENTRIES is reached. After
 * that, entries come from the page at the head of p->table; a new page is
 * allocated and pushed on the list when there is none or the head is full.
 *
 * Returns NULL, with p->error set to -ENOMEM, if the page allocation fails.
 */
static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
{
	struct poll_table_page *table = p->table;

	if (p->inline_index < N_INLINE_POLL_ENTRIES)
		return p->inline_entries + p->inline_index++;

	if (!table || POLL_TABLE_FULL(table)) {
		struct poll_table_page *new_table;

		new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
		if (!new_table) {
			p->error = -ENOMEM;
			return NULL;
		}
		new_table->entry = new_table->entries;
		new_table->next = table;
		p->table = new_table;
		table = new_table;
	}

	return table->entry++;
}

/*
 * Wake the polling task recorded in the poll_wqueues that @wait->private
 * points to, after marking it as triggered.
 */
static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
	struct poll_wqueues *pwq = wait->private;
	DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);

	/*
	 * Although this function is called under waitqueue lock, LOCK
	 * doesn't imply write barrier and the users expect write
	 * barrier semantics on wakeup functions.  The following
	 * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
	 * and is paired with smp_store_mb() in poll_schedule_timeout.
	 */
	smp_wmb();
	pwq->triggered = 1;

	/*
	 * Perform the default wake up operation using a dummy
	 * waitqueue.
	 *
	 * TODO: This is hacky but there currently is no interface to
	 * pass in @sync.  @sync is scheduled to be removed and once
	 * that happens, wake_up_process() can be used directly.
	 */
	return default_wake_function(&dummy_wait, mode, sync, key);
}

/*
 * Wait queue callback installed by __pollwait(). A wakeup that carries a
 * key with none of the events in entry->key is ignored (returns 0);
 * anything else is forwarded to __pollwake().
 */
static int pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
	struct poll_table_entry *entry;

	entry = container_of(wait, struct poll_table_entry, wait);
	if (key && !(key_to_poll(key) & entry->key))
		return 0;
	return __pollwake(wait, mode, sync, key);
}

/* Add a new entry */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
				poll_table *p)
{
	struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
	struct poll_table_entry *entry = poll_get_entry(pwq);
	/* No entry available: poll_get_entry() has set pwq->error. */
	if (!entry)
		return;
	/* The entry holds its own file reference until free_poll_entry(). */
	entry->filp = get_file(filp);
	entry->wait_address = wait_address;
	entry->key = p->_key;
	init_waitqueue_func_entry(&entry->wait, pollwake);
	entry->wait.private = pwq;
	add_wait_queue(wait_address, &entry->wait);
}

/*
 * Sleep in @state until @expires (an absolute time, with @slack) unless
 * @pwq has already been triggered.
 *
 * Returns -EINTR when no sleep was attempted because the trigger flag was
 * already set, otherwise the result of schedule_hrtimeout_range(). The
 * trigger flag is cleared before returning.
 */
static int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
			  ktime_t *expires, unsigned long slack)
{
	int rc = -EINTR;

	set_current_state(state);
	if (!pwq->triggered)
		rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
	__set_current_state(TASK_RUNNING);

	/*
	 * Prepare for the next iteration.
	 *
	 * The following smp_store_mb() serves two purposes.  First, it's
	 * the counterpart rmb of the wmb in pollwake() such that data
	 * written before wake up is always visible after wake up.
	 * Second, the full barrier guarantees that triggered clearing
	 * doesn't pass event check of the next iteration.  Note that
	 * this problem doesn't exist for the first iteration as
	 * add_wait_queue() has full barrier semantics.
	 */
	smp_store_mb(pwq->triggered, 0);

	return rc;
}

/**
 * poll_select_set_timeout - helper function to setup the timeout value
 * @to:		pointer to timespec64 variable for the final timeout
 * @sec:	seconds (from user space)
 * @nsec:	nanoseconds (from user space)
 *
 * Note, we do not use a timespec for the user space value here, That
 * way we can use the function for timeval and compat interfaces as well.
 *
 * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0.
 */
int poll_select_set_timeout(struct timespec64 *to, time64_t sec, long nsec)
{
	struct timespec64 ts = {.tv_sec = sec, .tv_nsec = nsec};

	if (!timespec64_valid(&ts))
		return -EINVAL;

	/* Optimize for the zero timeout value here */
	if (!sec && !nsec) {
		to->tv_sec = to->tv_nsec = 0;
	} else {
		/* Convert the relative timeout into an absolute end time. */
		ktime_get_ts64(to);
		*to = timespec64_add_safe(*to, ts);
	}
	return 0;
}

/*
 * Layout of the user-space timeout object that poll_select_finish()
 * writes the remaining time back to.
 */
enum poll_time_type {
	PT_TIMEVAL = 0,
	PT_OLD_TIMEVAL = 1,
	PT_TIMESPEC = 2,
	PT_OLD_TIMESPEC = 3,
};

/*
 * Finish a select/poll style call: restore the saved signal mask unless
 * @ret is -ERESTARTNOHAND and, when the caller supplied a timeout object
 * @p, write the time remaining until @end_time back to it in the layout
 * selected by @pt_type.
 *
 * Nothing is written back when @p is NULL, when the task has the
 * STICKY_TIMEOUTS personality flag, or when @end_time is all zero.
 *
 * Returns @ret, except that -ERESTARTNOHAND becomes -EINTR when the
 * timeout was not written back (sticky personality or failed copy).
 */
static int poll_select_finish(struct timespec64 *end_time,
			      void __user *p,
			      enum poll_time_type pt_type, int ret)
{
	struct timespec64 rts;

	restore_saved_sigmask_unless(ret == -ERESTARTNOHAND);

	if (!p)
		return ret;

	if (current->personality & STICKY_TIMEOUTS)
		goto sticky;

	/* No update for zero timeout */
	if (!end_time->tv_sec && !end_time->tv_nsec)
		return ret;

	/* Remaining time = end time - now, clamped at zero. */
	ktime_get_ts64(&rts);
	rts = timespec64_sub(*end_time, rts);
	if (rts.tv_sec < 0)
		rts.tv_sec = rts.tv_nsec = 0;


	switch (pt_type) {
	case PT_TIMEVAL:
		{
			struct __kernel_old_timeval rtv;

			/* Clear the struct first if it is larger than its two fields. */
			if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec))
				memset(&rtv, 0, sizeof(rtv));
			rtv.tv_sec = rts.tv_sec;
			rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;
			if (!copy_to_user(p, &rtv, sizeof(rtv)))
				return ret;
		}
		break;
	case PT_OLD_TIMEVAL:
		{
			struct old_timeval32 rtv;

			rtv.tv_sec = rts.tv_sec;
			rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;
			if (!copy_to_user(p, &rtv, sizeof(rtv)))
				return ret;
		}
		break;
	case PT_TIMESPEC:
		if (!put_timespec64(&rts, p))
			return ret;
		break;
	case PT_OLD_TIMESPEC:
		if (!put_old_timespec32(&rts, p))
			return ret;
		break;
	default:
		BUG();
	}
	/*
	 * If an application puts its timeval in read-only memory, we
	 * don't want the Linux-specific update to the timeval to
	 * cause a fault after the select has completed
	 * successfully. However, because we're not updating the
	 * timeval, we can't restart the system call.
	 */

sticky:
	if (ret == -ERESTARTNOHAND)
		ret = -EINTR;
	return ret;
}

/*
 * Scalable version of the fd_set.
 */

typedef struct {
	unsigned long *in, *out, *ex;
	unsigned long *res_in, *res_out, *res_ex;
} fd_set_bits;

/*
 * How many longwords for "nr" bits?
 */
#define FDS_BITPERLONG	(8*sizeof(long))
#define FDS_LONGS(nr)	(((nr)+FDS_BITPERLONG-1)/FDS_BITPERLONG)
#define FDS_BYTES(nr)	(FDS_LONGS(nr)*sizeof(long))

/*
 * Use "unsigned long" accesses to let user-mode fd_set's be long-aligned.
 *
 * Fill @fdset with FDS_BYTES(@nr) bytes copied from @ufdset, or with
 * zeroes when @ufdset is NULL. Returns 0, or -EFAULT if the copy fails.
 */
static inline
int get_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset)
{
	nr = FDS_BYTES(nr);
	if (ufdset)
		return copy_from_user(fdset, ufdset, nr) ? -EFAULT : 0;

	memset(fdset, 0, nr);
	return 0;
}

/*
 * Copy FDS_BYTES(@nr) bytes of @fdset out to @ufdset; a NULL @ufdset is
 * skipped. Returns 0 when nothing had to be copied, otherwise whatever
 * __copy_to_user() returns.
 */
static inline unsigned long __must_check
set_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset)
{
	if (ufdset)
		return __copy_to_user(ufdset, fdset, FDS_BYTES(nr));
	return 0;
}

/* Clear the FDS_BYTES(@nr) bytes of the bitmap @fdset. */
static inline
void zero_fd_set(unsigned long nr, unsigned long *fdset)
{
	memset(fdset, 0, FDS_BYTES(nr));
}

/* Address of long-word n of the in/out/ex bitmap of fds. */
#define FDS_IN(fds, n)		(fds->in + n)
#define FDS_OUT(fds, n)		(fds->out + n)
#define FDS_EX(fds, n)		(fds->ex + n)

/* Union of the three request bitmaps for long-word n. */
#define BITS(fds, n)	(*FDS_IN(fds, n)|*FDS_OUT(fds, n)|*FDS_EX(fds, n))

/*
 * Validate the request bitmaps in @fds against the current task's open
 * file bitmap and find the highest requested descriptor below @n.
 *
 * Returns -EBADF if any requested bit is not set in open_fds. Otherwise
 * returns the highest requested descriptor plus one, or 0 when no bit is
 * requested at all.
 */
static int max_select_fd(unsigned long n, fd_set_bits *fds)
{
	unsigned long *open_fds;
	unsigned long set;
	int max;
	struct fdtable *fdt;

	/* handle last in-complete long-word first */
	set = ~(~0UL << (n & (BITS_PER_LONG-1)));
	n /= BITS_PER_LONG;
	fdt = files_fdtable(current->files);
	open_fds = fdt->open_fds + n;
	max = 0;
	if (set) {
		set &= BITS(fds, n);
		if (set) {
			if (!(set & ~*open_fds))
				goto get_max;
			return -EBADF;
		}
	}
	/* Walk the complete long-words from the top down. */
	while (n) {
		open_fds--;
		n--;
		set = BITS(fds, n);
		if (!set)
			continue;
		if (set & ~*open_fds)
			return -EBADF;
		/* Maximum already known; keep going only to validate. */
		if (max)
			continue;
get_max:
		/* Count bit positions up to the highest set bit of this word. */
		do {
			max++;
			set >>= 1;
		} while (set);
		max += n * BITS_PER_LONG;
	}

	return max;
}

/* Event masks that count as readable, writable and exceptional. */
#define POLLIN_SET (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN | EPOLLHUP | EPOLLERR |\
			EPOLLNVAL)
#define POLLOUT_SET (EPOLLWRBAND | EPOLLWRNORM | EPOLLOUT | EPOLLERR |\
			 EPOLLNVAL)
#define POLLEX_SET (EPOLLPRI | EPOLLNVAL)

/*
 * Set wait->_key for one descriptor: always POLLEX_SET plus @ll_flag,
 * plus POLLIN_SET / POLLOUT_SET when @bit is set in @in / @out.
 */
static inline void wait_key_set(poll_table *wait, unsigned long in,
				unsigned long out, unsigned long bit,
				__poll_t ll_flag)
{
	wait->_key = POLLEX_SET | ll_flag;
	if (in & bit)
		wait->_key |= POLLIN_SET;
	if (out & bit)
		wait->_key |= POLLOUT_SET;
}

/*
 * Core of select(): poll every descriptor requested in @fds (below @n)
 * and record ready ones in the res_in/res_out/res_ex bitmaps, sleeping
 * and rescanning until something is ready, @end_time passes or a signal
 * is pending. A NULL @end_time means no timeout; an all-zero @end_time
 * means a single non-blocking scan.
 *
 * Returns the number of ready (descriptor, set) pairs, 0 if nothing
 * became ready, or a negative error: -EBADF from max_select_fd(), or the
 * error recorded in the poll table.
 */
static noinline_for_stack int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
{
	ktime_t expire, *to = NULL;
	struct poll_wqueues table;
	poll_table *wait;
	int retval, i, timed_out = 0;
	u64 slack = 0;
	__poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
	unsigned long busy_start = 0;

	rcu_read_lock();
	retval = max_select_fd(n, fds);
	rcu_read_unlock();

	if (retval < 0)
		return retval;
	/* Only scan up to the highest descriptor actually requested. */
	n = retval;

	poll_initwait(&table);
	wait = &table.pt;
	/* Zero timeout: do not queue on any wait queue, scan once only. */
	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
		wait->_qproc = NULL;
		timed_out = 1;
	}

	if (end_time && !timed_out)
		slack = select_estimate_accuracy(end_time);

	retval = 0;
	for (;;) {
		unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
		bool can_busy_loop = false;

		inp = fds->in; outp = fds->out; exp = fds->ex;
		rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;

		/* One iteration per long-word; i is advanced by the bit loop. */
		for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
			unsigned long in, out, ex, all_bits, bit = 1, j;
			unsigned long res_in = 0, res_out = 0, res_ex = 0;
			__poll_t mask;

			in = *inp++; out = *outp++; ex = *exp++;
			all_bits = in | out | ex;
			/* Nothing requested in this word: skip it entirely. */
			if (all_bits == 0) {
				i += BITS_PER_LONG;
				continue;
			}

			for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) {
				struct fd f;
				if (i >= n)
					break;
				if (!(bit & all_bits))
					continue;
				/* A descriptor without a file reports EPOLLNVAL. */
				mask = EPOLLNVAL;
				f = fdget(i);
				if (f.file) {
					wait_key_set(wait, in, out, bit,
						     busy_flag);
					mask = vfs_poll(f.file, wait);

					fdput(f);
				}
				/* Once anything is ready, stop queueing on further files. */
				if ((mask & POLLIN_SET) && (in & bit)) {
					res_in |= bit;
					retval++;
					wait->_qproc = NULL;
				}
				if ((mask & POLLOUT_SET) && (out & bit)) {
					res_out |= bit;
					retval++;
					wait->_qproc = NULL;
				}
				if ((mask & POLLEX_SET) && (ex & bit)) {
					res_ex |= bit;
					retval++;
					wait->_qproc = NULL;
				}
				/* got something, stop busy polling */
				if (retval) {
					can_busy_loop = false;
					busy_flag = 0;

				/*
				 * only remember a returned
				 * POLL_BUSY_LOOP if we asked for it
				 */
				} else if (busy_flag & mask)
					can_busy_loop = true;

			}
			if (res_in)
				*rinp = res_in;
			if (res_out)
				*routp = res_out;
			if (res_ex)
				*rexp = res_ex;
			cond_resched();
		}
		/* The first full scan has queued everything; never queue again. */
		wait->_qproc = NULL;
		if (retval || timed_out || signal_pending(current))
			break;
		if (table.error) {
			retval = table.error;
			break;
		}

		/* only if found POLL_BUSY_LOOP sockets && not out of time */
		if (can_busy_loop && !need_resched()) {
			if (!busy_start) {
				busy_start = busy_loop_current_time();
				continue;
			}
			if (!busy_loop_timeout(busy_start))
				continue;
		}
		busy_flag = 0;

		/*
		 * If this is the first loop and we have a timeout
		 * given, then we convert to ktime_t and set the to
		 * pointer to the expiry value.
		 */
		if (end_time && !to) {
			expire = timespec64_to_ktime(*end_time);
			to = &expire;
		}

		/* A zero return is treated as timeout: one last scan, then stop. */
		if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
					   to, slack))
			timed_out = 1;
	}

	poll_freewait(&table);

	return retval;
}

/*
 * We can actually return ERESTARTSYS instead of EINTR, but I'd
 * like to be certain this leads to no problems. So I return
 * EINTR just for safety.
 *
 * Update: ERESTARTSYS breaks at least the xview clock binary, so
 * I'm trying ERESTARTNOHAND which restart only when you want to.
*/ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timespec64 *end_time) { fd_set_bits fds; void *bits; int ret, max_fds; size_t size, alloc_size; struct fdtable *fdt; /* Allocate small arguments on the stack to save memory and be faster */ long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; ret = -EINVAL; if (n < 0) goto out_nofds; /* max_fds can increase, so grab it once to avoid race */ rcu_read_lock(); fdt = files_fdtable(current->files); max_fds = fdt->max_fds; rcu_read_unlock(); if (n > max_fds) n = max_fds; /* * We need 6 bitmaps (in/out/ex for both incoming and outgoing), * since we used fdset we need to allocate memory in units of * long-words. */ size = FDS_BYTES(n); bits = stack_fds; if (size > sizeof(stack_fds) / 6) { /* Not enough space in on-stack array; must use kmalloc */ ret = -ENOMEM; if (size > (SIZE_MAX / 6)) goto out_nofds; alloc_size = 6 * size; bits = kvmalloc(alloc_size, GFP_KERNEL); if (!bits) goto out_nofds; } fds.in = bits; fds.out = bits + size; fds.ex = bits + 2*size; fds.res_in = bits + 3*size; fds.res_out = bits + 4*size; fds.res_ex = bits + 5*size; if ((ret = get_fd_set(n, inp, fds.in)) || (ret = get_fd_set(n, outp, fds.out)) || (ret = get_fd_set(n, exp, fds.ex))) goto out; zero_fd_set(n, fds.res_in); zero_fd_set(n, fds.res_out); zero_fd_set(n, fds.res_ex); ret = do_select(n, &fds, end_time); if (ret < 0) goto out; if (!ret) { ret = -ERESTARTNOHAND; if (signal_pending(current)) goto out; ret = 0; } if (set_fd_set(n, inp, fds.res_in) || set_fd_set(n, outp, fds.res_out) || set_fd_set(n, exp, fds.res_ex)) ret = -EFAULT; out: if (bits != stack_fds) kvfree(bits); out_nofds: return ret; } static int kern_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct __kernel_old_timeval __user *tvp) { struct timespec64 end_time, *to = NULL; struct __kernel_old_timeval tv; int ret; if (tvp) { if (copy_from_user(&tv, tvp, sizeof(tv))) return -EFAULT; to = &end_time; if 
(poll_select_set_timeout(to, tv.tv_sec + (tv.tv_usec / USEC_PER_SEC), (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC)) return -EINVAL; } ret = core_sys_select(n, inp, outp, exp, to); return poll_select_finish(&end_time, tvp, PT_TIMEVAL, ret); } SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp, fd_set __user *, exp, struct __kernel_old_timeval __user *, tvp) { return kern_select(n, inp, outp, exp, tvp); } static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, void __user *tsp, const sigset_t __user *sigmask, size_t sigsetsize, enum poll_time_type type) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { switch (type) { case PT_TIMESPEC: if (get_timespec64(&ts, tsp)) return -EFAULT; break; case PT_OLD_TIMESPEC: if (get_old_timespec32(&ts, tsp)) return -EFAULT; break; default: BUG(); } to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = core_sys_select(n, inp, outp, exp, to); return poll_select_finish(&end_time, tsp, type, ret); } /* * Most architectures can't handle 7-argument syscalls. So we provide a * 6-argument version where the sixth argument is a pointer to a structure * which has a pointer to the sigset_t itself followed by a size_t containing * the sigset size. 
*/ struct sigset_argpack { sigset_t __user *p; size_t size; }; static inline int get_sigset_argpack(struct sigset_argpack *to, struct sigset_argpack __user *from) { // the path is hot enough for overhead of copy_from_user() to matter if (from) { if (!user_read_access_begin(from, sizeof(*from))) return -EFAULT; unsafe_get_user(to->p, &from->p, Efault); unsafe_get_user(to->size, &from->size, Efault); user_read_access_end(); } return 0; Efault: user_access_end(); return -EFAULT; } SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp, fd_set __user *, exp, struct __kernel_timespec __user *, tsp, void __user *, sig) { struct sigset_argpack x = {NULL, 0}; if (get_sigset_argpack(&x, sig)) return -EFAULT; return do_pselect(n, inp, outp, exp, tsp, x.p, x.size, PT_TIMESPEC); } #if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT) SYSCALL_DEFINE6(pselect6_time32, int, n, fd_set __user *, inp, fd_set __user *, outp, fd_set __user *, exp, struct old_timespec32 __user *, tsp, void __user *, sig) { struct sigset_argpack x = {NULL, 0}; if (get_sigset_argpack(&x, sig)) return -EFAULT; return do_pselect(n, inp, outp, exp, tsp, x.p, x.size, PT_OLD_TIMESPEC); } #endif #ifdef __ARCH_WANT_SYS_OLD_SELECT struct sel_arg_struct { unsigned long n; fd_set __user *inp, *outp, *exp; struct __kernel_old_timeval __user *tvp; }; SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg) { struct sel_arg_struct a; if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; return kern_select(a.n, a.inp, a.outp, a.exp, a.tvp); } #endif struct poll_list { struct poll_list *next; unsigned int len; struct pollfd entries[]; }; #define POLLFD_PER_PAGE ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd)) /* * Fish for pollable events on the pollfd->fd file descriptor. We're only * interested in events matching the pollfd->events mask, and the result * matching that mask is both recorded in pollfd->revents and returned. 
The * pwait poll_table will be used by the fd-provided poll handler for waiting, * if pwait->_qproc is non-NULL. */ static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait, bool *can_busy_poll, __poll_t busy_flag) { int fd = pollfd->fd; __poll_t mask = 0, filter; struct fd f; if (fd < 0) goto out; mask = EPOLLNVAL; f = fdget(fd); if (!f.file) goto out; /* userland u16 ->events contains POLL... bitmap */ filter = demangle_poll(pollfd->events) | EPOLLERR | EPOLLHUP; pwait->_key = filter | busy_flag; mask = vfs_poll(f.file, pwait); if (mask & busy_flag) *can_busy_poll = true; mask &= filter; /* Mask out unneeded events. */ fdput(f); out: /* ... and so does ->revents */ pollfd->revents = mangle_poll(mask); return mask; } static int do_poll(struct poll_list *list, struct poll_wqueues *wait, struct timespec64 *end_time) { poll_table* pt = &wait->pt; ktime_t expire, *to = NULL; int timed_out = 0, count = 0; u64 slack = 0; __poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; unsigned long busy_start = 0; /* Optimise the no-wait case */ if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { pt->_qproc = NULL; timed_out = 1; } if (end_time && !timed_out) slack = select_estimate_accuracy(end_time); for (;;) { struct poll_list *walk; bool can_busy_loop = false; for (walk = list; walk != NULL; walk = walk->next) { struct pollfd * pfd, * pfd_end; pfd = walk->entries; pfd_end = pfd + walk->len; for (; pfd != pfd_end; pfd++) { /* * Fish for events. If we found one, record it * and kill poll_table->_qproc, so we don't * needlessly register any other waiters after * this. They'll get immediately deregistered * when we break out and return. */ if (do_pollfd(pfd, pt, &can_busy_loop, busy_flag)) { count++; pt->_qproc = NULL; /* found something, stop busy polling */ busy_flag = 0; can_busy_loop = false; } } } /* * All waiters have already been registered, so don't provide * a poll_table->_qproc to them on the next loop iteration. 
*/ pt->_qproc = NULL; if (!count) { count = wait->error; if (signal_pending(current)) count = -ERESTARTNOHAND; } if (count || timed_out) break; /* only if found POLL_BUSY_LOOP sockets && not out of time */ if (can_busy_loop && !need_resched()) { if (!busy_start) { busy_start = busy_loop_current_time(); continue; } if (!busy_loop_timeout(busy_start)) continue; } busy_flag = 0; /* * If this is the first loop and we have a timeout * given, then we convert to ktime_t and set the to * pointer to the expiry value. */ if (end_time && !to) { expire = timespec64_to_ktime(*end_time); to = &expire; } if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack)) timed_out = 1; } return count; } #define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list)) / \ sizeof(struct pollfd)) static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, struct timespec64 *end_time) { struct poll_wqueues table; int err = -EFAULT, fdcount; /* Allocate small arguments on the stack to save memory and be faster - use long to make sure the buffer is aligned properly on 64 bit archs to avoid unaligned access */ long stack_pps[POLL_STACK_ALLOC/sizeof(long)]; struct poll_list *const head = (struct poll_list *)stack_pps; struct poll_list *walk = head; unsigned int todo = nfds; unsigned int len; if (nfds > rlimit(RLIMIT_NOFILE)) return -EINVAL; len = min_t(unsigned int, nfds, N_STACK_PPS); for (;;) { walk->next = NULL; walk->len = len; if (!len) break; if (copy_from_user(walk->entries, ufds + nfds-todo, sizeof(struct pollfd) * walk->len)) goto out_fds; if (walk->len >= todo) break; todo -= walk->len; len = min(todo, POLLFD_PER_PAGE); walk = walk->next = kmalloc(struct_size(walk, entries, len), GFP_KERNEL); if (!walk) { err = -ENOMEM; goto out_fds; } } poll_initwait(&table); fdcount = do_poll(head, &table, end_time); poll_freewait(&table); if (!user_write_access_begin(ufds, nfds * sizeof(*ufds))) goto out_fds; for (walk = head; walk; walk = walk->next) { struct pollfd *fds = 
walk->entries; unsigned int j; for (j = walk->len; j; fds++, ufds++, j--) unsafe_put_user(fds->revents, &ufds->revents, Efault); } user_write_access_end(); err = fdcount; out_fds: walk = head->next; while (walk) { struct poll_list *pos = walk; walk = walk->next; kfree(pos); } return err; Efault: user_write_access_end(); err = -EFAULT; goto out_fds; } static long do_restart_poll(struct restart_block *restart_block) { struct pollfd __user *ufds = restart_block->poll.ufds; int nfds = restart_block->poll.nfds; struct timespec64 *to = NULL, end_time; int ret; if (restart_block->poll.has_timeout) { end_time.tv_sec = restart_block->poll.tv_sec; end_time.tv_nsec = restart_block->poll.tv_nsec; to = &end_time; } ret = do_sys_poll(ufds, nfds, to); if (ret == -ERESTARTNOHAND) ret = set_restart_fn(restart_block, do_restart_poll); return ret; } SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, int, timeout_msecs) { struct timespec64 end_time, *to = NULL; int ret; if (timeout_msecs >= 0) { to = &end_time; poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC, NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC)); } ret = do_sys_poll(ufds, nfds, to); if (ret == -ERESTARTNOHAND) { struct restart_block *restart_block; restart_block = &current->restart_block; restart_block->poll.ufds = ufds; restart_block->poll.nfds = nfds; if (timeout_msecs >= 0) { restart_block->poll.tv_sec = end_time.tv_sec; restart_block->poll.tv_nsec = end_time.tv_nsec; restart_block->poll.has_timeout = 1; } else restart_block->poll.has_timeout = 0; ret = set_restart_fn(restart_block, do_restart_poll); } return ret; } SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, struct __kernel_timespec __user *, tsp, const sigset_t __user *, sigmask, size_t, sigsetsize) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { if (get_timespec64(&ts, tsp)) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = 
set_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); return poll_select_finish(&end_time, tsp, PT_TIMESPEC, ret); } #if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT) SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds, struct old_timespec32 __user *, tsp, const sigset_t __user *, sigmask, size_t, sigsetsize) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { if (get_old_timespec32(&ts, tsp)) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); return poll_select_finish(&end_time, tsp, PT_OLD_TIMESPEC, ret); } #endif #ifdef CONFIG_COMPAT #define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t)) /* * Ooo, nasty. We need here to frob 32-bit unsigned longs to * 64-bit unsigned longs. */ static int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, unsigned long *fdset) { if (ufdset) { return compat_get_bitmap(fdset, ufdset, nr); } else { zero_fd_set(nr, fdset); return 0; } } static int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, unsigned long *fdset) { if (!ufdset) return 0; return compat_put_bitmap(ufdset, fdset, nr); } /* * This is a virtual copy of sys_select from fs/select.c and probably * should be compared to it from time to time */ /* * We can actually return ERESTARTSYS instead of EINTR, but I'd * like to be certain this leads to no problems. So I return * EINTR just for safety. * * Update: ERESTARTSYS breaks at least the xview clock binary, so * I'm trying ERESTARTNOHAND which restart only when you want to. 
*/ static int compat_core_sys_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, struct timespec64 *end_time) { fd_set_bits fds; void *bits; int size, max_fds, ret = -EINVAL; struct fdtable *fdt; long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; if (n < 0) goto out_nofds; /* max_fds can increase, so grab it once to avoid race */ rcu_read_lock(); fdt = files_fdtable(current->files); max_fds = fdt->max_fds; rcu_read_unlock(); if (n > max_fds) n = max_fds; /* * We need 6 bitmaps (in/out/ex for both incoming and outgoing), * since we used fdset we need to allocate memory in units of * long-words. */ size = FDS_BYTES(n); bits = stack_fds; if (size > sizeof(stack_fds) / 6) { bits = kmalloc_array(6, size, GFP_KERNEL); ret = -ENOMEM; if (!bits) goto out_nofds; } fds.in = (unsigned long *) bits; fds.out = (unsigned long *) (bits + size); fds.ex = (unsigned long *) (bits + 2*size); fds.res_in = (unsigned long *) (bits + 3*size); fds.res_out = (unsigned long *) (bits + 4*size); fds.res_ex = (unsigned long *) (bits + 5*size); if ((ret = compat_get_fd_set(n, inp, fds.in)) || (ret = compat_get_fd_set(n, outp, fds.out)) || (ret = compat_get_fd_set(n, exp, fds.ex))) goto out; zero_fd_set(n, fds.res_in); zero_fd_set(n, fds.res_out); zero_fd_set(n, fds.res_ex); ret = do_select(n, &fds, end_time); if (ret < 0) goto out; if (!ret) { ret = -ERESTARTNOHAND; if (signal_pending(current)) goto out; ret = 0; } if (compat_set_fd_set(n, inp, fds.res_in) || compat_set_fd_set(n, outp, fds.res_out) || compat_set_fd_set(n, exp, fds.res_ex)) ret = -EFAULT; out: if (bits != stack_fds) kfree(bits); out_nofds: return ret; } static int do_compat_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, struct old_timeval32 __user *tvp) { struct timespec64 end_time, *to = NULL; struct old_timeval32 tv; int ret; if (tvp) { if (copy_from_user(&tv, tvp, sizeof(tv))) return -EFAULT; to = &end_time; if 
(poll_select_set_timeout(to, tv.tv_sec + (tv.tv_usec / USEC_PER_SEC), (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC)) return -EINVAL; } ret = compat_core_sys_select(n, inp, outp, exp, to); return poll_select_finish(&end_time, tvp, PT_OLD_TIMEVAL, ret); } COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp, compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, struct old_timeval32 __user *, tvp) { return do_compat_select(n, inp, outp, exp, tvp); } struct compat_sel_arg_struct { compat_ulong_t n; compat_uptr_t inp; compat_uptr_t outp; compat_uptr_t exp; compat_uptr_t tvp; }; COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg) { struct compat_sel_arg_struct a; if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; return do_compat_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp), compat_ptr(a.exp), compat_ptr(a.tvp)); } static long do_compat_pselect(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, void __user *tsp, compat_sigset_t __user *sigmask, compat_size_t sigsetsize, enum poll_time_type type) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { switch (type) { case PT_OLD_TIMESPEC: if (get_old_timespec32(&ts, tsp)) return -EFAULT; break; case PT_TIMESPEC: if (get_timespec64(&ts, tsp)) return -EFAULT; break; default: BUG(); } to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_compat_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = compat_core_sys_select(n, inp, outp, exp, to); return poll_select_finish(&end_time, tsp, type, ret); } struct compat_sigset_argpack { compat_uptr_t p; compat_size_t size; }; static inline int get_compat_sigset_argpack(struct compat_sigset_argpack *to, struct compat_sigset_argpack __user *from) { if (from) { if (!user_read_access_begin(from, sizeof(*from))) return -EFAULT; unsafe_get_user(to->p, &from->p, Efault); unsafe_get_user(to->size, &from->size, Efault); 
user_read_access_end(); } return 0; Efault: user_access_end(); return -EFAULT; } COMPAT_SYSCALL_DEFINE6(pselect6_time64, int, n, compat_ulong_t __user *, inp, compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, struct __kernel_timespec __user *, tsp, void __user *, sig) { struct compat_sigset_argpack x = {0, 0}; if (get_compat_sigset_argpack(&x, sig)) return -EFAULT; return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(x.p), x.size, PT_TIMESPEC); } #if defined(CONFIG_COMPAT_32BIT_TIME) COMPAT_SYSCALL_DEFINE6(pselect6_time32, int, n, compat_ulong_t __user *, inp, compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, struct old_timespec32 __user *, tsp, void __user *, sig) { struct compat_sigset_argpack x = {0, 0}; if (get_compat_sigset_argpack(&x, sig)) return -EFAULT; return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(x.p), x.size, PT_OLD_TIMESPEC); } #endif #if defined(CONFIG_COMPAT_32BIT_TIME) COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds, struct old_timespec32 __user *, tsp, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { if (get_old_timespec32(&ts, tsp)) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_compat_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); return poll_select_finish(&end_time, tsp, PT_OLD_TIMESPEC, ret); } #endif /* New compat syscall for 64 bit time_t*/ COMPAT_SYSCALL_DEFINE5(ppoll_time64, struct pollfd __user *, ufds, unsigned int, nfds, struct __kernel_timespec __user *, tsp, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { if (get_timespec64(&ts, tsp)) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_compat_user_sigmask(sigmask, 
sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); return poll_select_finish(&end_time, tsp, PT_TIMESPEC, ret); } #endif
3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 // SPDX-License-Identifier: GPL-2.0-or-later /* * Cryptographic API. * * Deflate algorithm (RFC 1951), implemented here primarily for use * by IPCOMP (RFC 3173 & RFC 2394). * * Copyright (c) 2003 James Morris <jmorris@intercode.com.au> * * FIXME: deflate transforms will require up to a total of about 436k of kernel * memory on i386 (390k for compression, the rest for decompression), as the * current zlib kernel code uses a worst case pre-allocation system by default. * This needs to be fixed so that the amount of memory required is properly * related to the winbits and memlevel parameters. * * The default winbits of 11 should suit most packets, and it may be something * to configure on a per-tfm basis in the future. 
* * Currently, compression history is not maintained between tfm calls, as * it is not needed for IPCOMP and keeps the code simpler. It can be * implemented if someone wants it. */ #include <linux/init.h> #include <linux/module.h> #include <linux/crypto.h> #include <linux/zlib.h> #include <linux/vmalloc.h> #include <linux/interrupt.h> #include <linux/mm.h> #include <linux/net.h> #include <crypto/internal/scompress.h> #define DEFLATE_DEF_LEVEL Z_DEFAULT_COMPRESSION #define DEFLATE_DEF_WINBITS 11 #define DEFLATE_DEF_MEMLEVEL MAX_MEM_LEVEL struct deflate_ctx { struct z_stream_s comp_stream; struct z_stream_s decomp_stream; }; static int deflate_comp_init(struct deflate_ctx *ctx) { int ret = 0; struct z_stream_s *stream = &ctx->comp_stream; stream->workspace = vzalloc(zlib_deflate_workspacesize( -DEFLATE_DEF_WINBITS, MAX_MEM_LEVEL)); if (!stream->workspace) { ret = -ENOMEM; goto out; } ret = zlib_deflateInit2(stream, DEFLATE_DEF_LEVEL, Z_DEFLATED, -DEFLATE_DEF_WINBITS, DEFLATE_DEF_MEMLEVEL, Z_DEFAULT_STRATEGY); if (ret != Z_OK) { ret = -EINVAL; goto out_free; } out: return ret; out_free: vfree(stream->workspace); goto out; } static int deflate_decomp_init(struct deflate_ctx *ctx) { int ret = 0; struct z_stream_s *stream = &ctx->decomp_stream; stream->workspace = vzalloc(zlib_inflate_workspacesize()); if (!stream->workspace) { ret = -ENOMEM; goto out; } ret = zlib_inflateInit2(stream, -DEFLATE_DEF_WINBITS); if (ret != Z_OK) { ret = -EINVAL; goto out_free; } out: return ret; out_free: vfree(stream->workspace); goto out; } static void deflate_comp_exit(struct deflate_ctx *ctx) { zlib_deflateEnd(&ctx->comp_stream); vfree(ctx->comp_stream.workspace); } static void deflate_decomp_exit(struct deflate_ctx *ctx) { zlib_inflateEnd(&ctx->decomp_stream); vfree(ctx->decomp_stream.workspace); } static int __deflate_init(void *ctx) { int ret; ret = deflate_comp_init(ctx); if (ret) goto out; ret = deflate_decomp_init(ctx); if (ret) deflate_comp_exit(ctx); out: return ret; } static 
void *deflate_alloc_ctx(struct crypto_scomp *tfm) { struct deflate_ctx *ctx; int ret; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return ERR_PTR(-ENOMEM); ret = __deflate_init(ctx); if (ret) { kfree(ctx); return ERR_PTR(ret); } return ctx; } static int deflate_init(struct crypto_tfm *tfm) { struct deflate_ctx *ctx = crypto_tfm_ctx(tfm); return __deflate_init(ctx); } static void __deflate_exit(void *ctx) { deflate_comp_exit(ctx); deflate_decomp_exit(ctx); } static void deflate_free_ctx(struct crypto_scomp *tfm, void *ctx) { __deflate_exit(ctx); kfree_sensitive(ctx); } static void deflate_exit(struct crypto_tfm *tfm) { struct deflate_ctx *ctx = crypto_tfm_ctx(tfm); __deflate_exit(ctx); } static int __deflate_compress(const u8 *src, unsigned int slen, u8 *dst, unsigned int *dlen, void *ctx) { int ret = 0; struct deflate_ctx *dctx = ctx; struct z_stream_s *stream = &dctx->comp_stream; ret = zlib_deflateReset(stream); if (ret != Z_OK) { ret = -EINVAL; goto out; } stream->next_in = (u8 *)src; stream->avail_in = slen; stream->next_out = (u8 *)dst; stream->avail_out = *dlen; ret = zlib_deflate(stream, Z_FINISH); if (ret != Z_STREAM_END) { ret = -EINVAL; goto out; } ret = 0; *dlen = stream->total_out; out: return ret; } static int deflate_compress(struct crypto_tfm *tfm, const u8 *src, unsigned int slen, u8 *dst, unsigned int *dlen) { struct deflate_ctx *dctx = crypto_tfm_ctx(tfm); return __deflate_compress(src, slen, dst, dlen, dctx); } static int deflate_scompress(struct crypto_scomp *tfm, const u8 *src, unsigned int slen, u8 *dst, unsigned int *dlen, void *ctx) { return __deflate_compress(src, slen, dst, dlen, ctx); } static int __deflate_decompress(const u8 *src, unsigned int slen, u8 *dst, unsigned int *dlen, void *ctx) { int ret = 0; struct deflate_ctx *dctx = ctx; struct z_stream_s *stream = &dctx->decomp_stream; ret = zlib_inflateReset(stream); if (ret != Z_OK) { ret = -EINVAL; goto out; } stream->next_in = (u8 *)src; stream->avail_in = slen; stream->next_out 
= (u8 *)dst; stream->avail_out = *dlen; ret = zlib_inflate(stream, Z_SYNC_FLUSH); /* * Work around a bug in zlib, which sometimes wants to taste an extra * byte when being used in the (undocumented) raw deflate mode. * (From USAGI). */ if (ret == Z_OK && !stream->avail_in && stream->avail_out) { u8 zerostuff = 0; stream->next_in = &zerostuff; stream->avail_in = 1; ret = zlib_inflate(stream, Z_FINISH); } if (ret != Z_STREAM_END) { ret = -EINVAL; goto out; } ret = 0; *dlen = stream->total_out; out: return ret; } static int deflate_decompress(struct crypto_tfm *tfm, const u8 *src, unsigned int slen, u8 *dst, unsigned int *dlen) { struct deflate_ctx *dctx = crypto_tfm_ctx(tfm); return __deflate_decompress(src, slen, dst, dlen, dctx); } static int deflate_sdecompress(struct crypto_scomp *tfm, const u8 *src, unsigned int slen, u8 *dst, unsigned int *dlen, void *ctx) { return __deflate_decompress(src, slen, dst, dlen, ctx); } static struct crypto_alg alg = { .cra_name = "deflate", .cra_driver_name = "deflate-generic", .cra_flags = CRYPTO_ALG_TYPE_COMPRESS, .cra_ctxsize = sizeof(struct deflate_ctx), .cra_module = THIS_MODULE, .cra_init = deflate_init, .cra_exit = deflate_exit, .cra_u = { .compress = { .coa_compress = deflate_compress, .coa_decompress = deflate_decompress } } }; static struct scomp_alg scomp = { .alloc_ctx = deflate_alloc_ctx, .free_ctx = deflate_free_ctx, .compress = deflate_scompress, .decompress = deflate_sdecompress, .base = { .cra_name = "deflate", .cra_driver_name = "deflate-scomp", .cra_module = THIS_MODULE, } }; static int __init deflate_mod_init(void) { int ret; ret = crypto_register_alg(&alg); if (ret) return ret; ret = crypto_register_scomp(&scomp); if (ret) { crypto_unregister_alg(&alg); return ret; } return ret; } static void __exit deflate_mod_fini(void) { crypto_unregister_alg(&alg); crypto_unregister_scomp(&scomp); } subsys_initcall(deflate_mod_init); module_exit(deflate_mod_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Deflate 
Compression Algorithm for IPCOMP"); MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); MODULE_ALIAS_CRYPTO("deflate");
799 3 792 796 30 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 // SPDX-License-Identifier: GPL-2.0 /* * Functions related to generic timeout handling of requests. */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/blkdev.h> #include <linux/fault-inject.h> #include "blk.h" #include "blk-mq.h" #ifdef CONFIG_FAIL_IO_TIMEOUT static DECLARE_FAULT_ATTR(fail_io_timeout); static int __init setup_fail_io_timeout(char *str) { return setup_fault_attr(&fail_io_timeout, str); } __setup("fail_io_timeout=", setup_fail_io_timeout); bool __blk_should_fake_timeout(struct request_queue *q) { return should_fail(&fail_io_timeout, 1); } EXPORT_SYMBOL_GPL(__blk_should_fake_timeout); static int __init fail_io_timeout_debugfs(void) { struct dentry *dir = fault_create_debugfs_attr("fail_io_timeout", NULL, &fail_io_timeout); return PTR_ERR_OR_ZERO(dir); } late_initcall(fail_io_timeout_debugfs); ssize_t part_timeout_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); int set = test_bit(QUEUE_FLAG_FAIL_IO, &disk->queue->queue_flags); return sprintf(buf, "%d\n", set != 0); } ssize_t part_timeout_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct gendisk *disk = dev_to_disk(dev); int val; if (count) { struct request_queue *q = disk->queue; char *p = (char *) buf; val = simple_strtoul(p, &p, 10); if (val) blk_queue_flag_set(QUEUE_FLAG_FAIL_IO, q); else 
blk_queue_flag_clear(QUEUE_FLAG_FAIL_IO, q);
	}

	/* The whole write is reported as consumed, whatever was parsed. */
	return count;
}

#endif /* CONFIG_FAIL_IO_TIMEOUT */

/**
 * blk_abort_request - Request recovery for the specified command
 * @req:	pointer to the request of interest
 *
 * This function requests that the block layer start recovery for the
 * request by deleting the timer and calling the q's timeout function.
 * LLDDs who implement their own error recovery MAY ignore the timeout
 * event if they generated blk_abort_request.
 */
void blk_abort_request(struct request *req)
{
	/*
	 * All we need to ensure is that timeout scan takes place
	 * immediately and that scan sees the new timeout value.
	 * No need for fancy synchronizations.
	 */
	WRITE_ONCE(req->deadline, jiffies);
	kblockd_schedule_work(&req->q->timeout_work);
}
EXPORT_SYMBOL_GPL(blk_abort_request);

/* Offset used by blk_round_jiffies(); written once by blk_timeout_init(). */
static unsigned long blk_timeout_mask __read_mostly;

/* Set blk_timeout_mask to roundup_pow_of_two(HZ) - 1. Always returns 0. */
static int __init blk_timeout_init(void)
{
	blk_timeout_mask = roundup_pow_of_two(HZ) - 1;
	return 0;
}

late_initcall(blk_timeout_init);

/*
 * Just a rough estimate, we don't care about specific values for timeouts.
 *
 * Returns @j advanced by blk_timeout_mask + 1; note that the sum is not
 * masked down to a boundary here.
 */
static inline unsigned long blk_round_jiffies(unsigned long j)
{
	return (j + blk_timeout_mask) + 1;
}

/*
 * Clamp @timeout (in jiffies) so that it is not after
 * blk_round_jiffies(jiffies + BLK_MAX_TIMEOUT), and return the result.
 */
unsigned long blk_rq_timeout(unsigned long timeout)
{
	unsigned long maxt;

	maxt = blk_round_jiffies(jiffies + BLK_MAX_TIMEOUT);
	if (time_after(timeout, maxt))
		timeout = maxt;

	return timeout;
}

/**
 * blk_add_timer - Start timeout timer for a single request
 * @req:	request that is about to start running.
 *
 * Notes:
 *    Each request has its own timer, and as it is added to the queue, we
 *    set up the timer. When the request completes, we cancel the timer.
 */
void blk_add_timer(struct request *req)
{
	struct request_queue *q = req->q;
	unsigned long expiry;

	/*
	 * Some LLDs, like scsi, peek at the timeout to prevent a
	 * command from being retried forever.
	 */
	if (!req->timeout)
		req->timeout = q->rq_timeout;

	/* Starting afresh: drop the timed-out flag from the request. */
	req->rq_flags &= ~RQF_TIMED_OUT;

	/* The request's own deadline is stored unrounded. */
	expiry = jiffies + req->timeout;
	WRITE_ONCE(req->deadline, expiry);

	/*
	 * If the timer isn't already pending or this timeout is earlier
	 * than an existing one, modify the timer. Round up to next nearest
	 * second.
	 */
	expiry = blk_rq_timeout(blk_round_jiffies(expiry));

	if (!timer_pending(&q->timeout) ||
	    time_before(expiry, q->timeout.expires)) {
		unsigned long diff = q->timeout.expires - expiry;

		/*
		 * Due to added timer slack to group timers, the timer
		 * will often be a little in front of what we asked for.
		 * So apply some tolerance here too, otherwise we keep
		 * modifying the timer because expires for value X
		 * will be X + something.
		 */
		if (!timer_pending(&q->timeout) || (diff >= HZ / 2))
			mod_timer(&q->timeout, expiry);
	}

}
827 5399 21 21 20 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 // SPDX-License-Identifier: GPL-2.0 /* * IA-32 Huge TLB Page Support for Kernel. * * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com> */ #include <linux/init.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/sched/mm.h> #include <linux/hugetlb.h> #include <linux/pagemap.h> #include <linux/err.h> #include <linux/sysctl.h> #include <linux/compat.h> #include <asm/mman.h> #include <asm/tlb.h> #include <asm/tlbflush.h> #include <asm/elf.h> /* * pmd_huge() returns 1 if @pmd is hugetlb related entry, that is normal * hugetlb entry or non-present (migration or hwpoisoned) hugetlb entry. * Otherwise, returns 0. */ int pmd_huge(pmd_t pmd) { return !pmd_none(pmd) && (pmd_val(pmd) & (_PAGE_PRESENT|_PAGE_PSE)) != _PAGE_PRESENT; } /* * pud_huge() returns 1 if @pud is hugetlb related entry, that is normal * hugetlb entry or non-present (migration or hwpoisoned) hugetlb entry. * Otherwise, returns 0. 
*/ int pud_huge(pud_t pud) { #if CONFIG_PGTABLE_LEVELS > 2 return !pud_none(pud) && (pud_val(pud) & (_PAGE_PRESENT|_PAGE_PSE)) != _PAGE_PRESENT; #else return 0; #endif } #ifdef CONFIG_HUGETLB_PAGE static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct hstate *h = hstate_file(file); struct vm_unmapped_area_info info; info.flags = 0; info.length = len; info.low_limit = get_mmap_base(1); /* * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area * in the full address space. */ info.high_limit = in_32bit_syscall() ? task_size_32bit() : task_size_64bit(addr > DEFAULT_MAP_WINDOW); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; return vm_unmapped_area(&info); } static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct hstate *h = hstate_file(file); struct vm_unmapped_area_info info; info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = PAGE_SIZE; info.high_limit = get_mmap_base(0); /* * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area * in the full address space. */ if (addr > DEFAULT_MAP_WINDOW && !in_32bit_syscall()) info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW; info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; addr = vm_unmapped_area(&info); /* * A failed mmap() very likely causes application failure, * so fall back to the bottom-up function here. This scenario * can happen with large stack limits and large mmap() * allocations. 
*/ if (addr & ~PAGE_MASK) { VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = TASK_UNMAPPED_BASE; info.high_limit = TASK_SIZE_LOW; addr = vm_unmapped_area(&info); } return addr; } unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct hstate *h = hstate_file(file); struct mm_struct *mm = current->mm; struct vm_area_struct *vma; if (len & ~huge_page_mask(h)) return -EINVAL; if (len > TASK_SIZE) return -ENOMEM; /* No address checking. See comment at mmap_address_hint_valid() */ if (flags & MAP_FIXED) { if (prepare_hugepage_range(file, addr, len)) return -EINVAL; return addr; } if (addr) { addr &= huge_page_mask(h); if (!mmap_address_hint_valid(addr, len)) goto get_unmapped_area; vma = find_vma(mm, addr); if (!vma || addr + len <= vm_start_gap(vma)) return addr; } get_unmapped_area: if (mm->get_unmapped_area == arch_get_unmapped_area) return hugetlb_get_unmapped_area_bottomup(file, addr, len, pgoff, flags); else return hugetlb_get_unmapped_area_topdown(file, addr, len, pgoff, flags); } #endif /* CONFIG_HUGETLB_PAGE */ #ifdef CONFIG_X86_64 bool __init arch_hugetlb_valid_size(unsigned long size) { if (size == PMD_SIZE) return true; else if (size == PUD_SIZE && boot_cpu_has(X86_FEATURE_GBPAGES)) return true; else return false; } #ifdef CONFIG_CONTIG_ALLOC static __init int gigantic_pages_init(void) { /* With compaction or CMA we can allocate gigantic pages at runtime */ if (boot_cpu_has(X86_FEATURE_GBPAGES)) hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); return 0; } arch_initcall(gigantic_pages_init); #endif #endif
2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 /* * net/tipc/discover.c * * Copyright (c) 2003-2006, 2014-2018, Ericsson AB * Copyright (c) 2005-2006, 2010-2011, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
*/ #include "core.h" #include "node.h" #include "discover.h" /* min delay during bearer start up */ #define TIPC_DISC_INIT msecs_to_jiffies(125) /* max delay if bearer has no links */ #define TIPC_DISC_FAST msecs_to_jiffies(1000) /* max delay if bearer has links */ #define TIPC_DISC_SLOW msecs_to_jiffies(60000) /* indicates no timer in use */ #define TIPC_DISC_INACTIVE 0xffffffff /** * struct tipc_discoverer - information about an ongoing link setup request * @bearer_id: identity of bearer issuing requests * @net: network namespace instance * @dest: destination address for request messages * @domain: network domain to which links can be established * @num_nodes: number of nodes currently discovered (i.e. with an active link) * @lock: spinlock for controlling access to requests * @skb: request message to be (repeatedly) sent * @timer: timer governing period between requests * @timer_intv: current interval between requests (in ms) */ struct tipc_discoverer { u32 bearer_id; struct tipc_media_addr dest; struct net *net; u32 domain; int num_nodes; spinlock_t lock; struct sk_buff *skb; struct timer_list timer; unsigned long timer_intv; }; /** * tipc_disc_init_msg - initialize a link setup message * @net: the applicable net namespace * @skb: buffer containing message * @mtyp: message type (request or response) * @b: ptr to bearer issuing message */ static void tipc_disc_init_msg(struct net *net, struct sk_buff *skb, u32 mtyp, struct tipc_bearer *b) { struct tipc_net *tn = tipc_net(net); u32 dest_domain = b->domain; struct tipc_msg *hdr; hdr = buf_msg(skb); tipc_msg_init(tn->trial_addr, hdr, LINK_CONFIG, mtyp, MAX_H_SIZE, dest_domain); msg_set_size(hdr, MAX_H_SIZE + NODE_ID_LEN); msg_set_non_seq(hdr, 1); msg_set_node_sig(hdr, tn->random); msg_set_node_capabilities(hdr, TIPC_NODE_CAPABILITIES); msg_set_dest_domain(hdr, dest_domain); msg_set_bc_netid(hdr, tn->net_id); b->media->addr2msg(msg_media_addr(hdr), &b->addr); msg_set_peer_net_hash(hdr, tipc_net_hash_mixes(net, 
tn->random)); msg_set_node_id(hdr, tipc_own_id(net)); } static void tipc_disc_msg_xmit(struct net *net, u32 mtyp, u32 dst, u32 src, u32 sugg_addr, struct tipc_media_addr *maddr, struct tipc_bearer *b) { struct tipc_msg *hdr; struct sk_buff *skb; skb = tipc_buf_acquire(MAX_H_SIZE + NODE_ID_LEN, GFP_ATOMIC); if (!skb) return; hdr = buf_msg(skb); tipc_disc_init_msg(net, skb, mtyp, b); msg_set_sugg_node_addr(hdr, sugg_addr); msg_set_dest_domain(hdr, dst); tipc_bearer_xmit_skb(net, b->identity, skb, maddr); } /** * disc_dupl_alert - issue node address duplication alert * @b: pointer to bearer detecting duplication * @node_addr: duplicated node address * @media_addr: media address advertised by duplicated node */ static void disc_dupl_alert(struct tipc_bearer *b, u32 node_addr, struct tipc_media_addr *media_addr) { char media_addr_str[64]; tipc_media_addr_printf(media_addr_str, sizeof(media_addr_str), media_addr); pr_warn("Duplicate %x using %s seen on <%s>\n", node_addr, media_addr_str, b->name); } /* tipc_disc_addr_trial(): - handle an address uniqueness trial from peer * Returns true if message should be dropped by caller, i.e., if it is a * trial message or we are inside trial period. Otherwise false. 
*/ static bool tipc_disc_addr_trial_msg(struct tipc_discoverer *d, struct tipc_media_addr *maddr, struct tipc_bearer *b, u32 dst, u32 src, u32 sugg_addr, u8 *peer_id, int mtyp) { struct net *net = d->net; struct tipc_net *tn = tipc_net(net); u32 self = tipc_own_addr(net); bool trial = time_before(jiffies, tn->addr_trial_end) && !self; if (mtyp == DSC_TRIAL_FAIL_MSG) { if (!trial) return true; /* Ignore if somebody else already gave new suggestion */ if (dst != tn->trial_addr) return true; /* Otherwise update trial address and restart trial period */ tn->trial_addr = sugg_addr; msg_set_prevnode(buf_msg(d->skb), sugg_addr); tn->addr_trial_end = jiffies + msecs_to_jiffies(1000); return true; } /* Apply trial address if we just left trial period */ if (!trial && !self) { schedule_work(&tn->work); msg_set_prevnode(buf_msg(d->skb), tn->trial_addr); msg_set_type(buf_msg(d->skb), DSC_REQ_MSG); } /* Accept regular link requests/responses only after trial period */ if (mtyp != DSC_TRIAL_MSG) return trial; sugg_addr = tipc_node_try_addr(net, peer_id, src); if (sugg_addr) tipc_disc_msg_xmit(net, DSC_TRIAL_FAIL_MSG, src, self, sugg_addr, maddr, b); return true; } /** * tipc_disc_rcv - handle incoming discovery message (request or response) * @net: applicable net namespace * @skb: buffer containing message * @b: bearer that message arrived on */ void tipc_disc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) { struct tipc_net *tn = tipc_net(net); struct tipc_msg *hdr = buf_msg(skb); u32 pnet_hash = msg_peer_net_hash(hdr); u16 caps = msg_node_capabilities(hdr); bool legacy = tn->legacy_addr_format; u32 sugg = msg_sugg_node_addr(hdr); u32 signature = msg_node_sig(hdr); u8 peer_id[NODE_ID_LEN] = {0,}; u32 dst = msg_dest_domain(hdr); u32 net_id = msg_bc_netid(hdr); struct tipc_media_addr maddr; u32 src = msg_prevnode(hdr); u32 mtyp = msg_type(hdr); bool dupl_addr = false; bool respond = false; u32 self; int err; if (skb_linearize(skb)) { kfree_skb(skb); return; } hdr 
= buf_msg(skb); if (caps & TIPC_NODE_ID128) memcpy(peer_id, msg_node_id(hdr), NODE_ID_LEN); else sprintf(peer_id, "%x", src); err = b->media->msg2addr(b, &maddr, msg_media_addr(hdr)); kfree_skb(skb); if (err || maddr.broadcast) { pr_warn_ratelimited("Rcv corrupt discovery message\n"); return; } /* Ignore discovery messages from own node */ if (!memcmp(&maddr, &b->addr, sizeof(maddr))) return; if (net_id != tn->net_id) return; if (tipc_disc_addr_trial_msg(b->disc, &maddr, b, dst, src, sugg, peer_id, mtyp)) return; self = tipc_own_addr(net); /* Message from somebody using this node's address */ if (in_own_node(net, src)) { disc_dupl_alert(b, self, &maddr); return; } if (!tipc_in_scope(legacy, dst, self)) return; if (!tipc_in_scope(legacy, b->domain, src)) return; tipc_node_check_dest(net, src, peer_id, b, caps, signature, pnet_hash, &maddr, &respond, &dupl_addr); if (dupl_addr) disc_dupl_alert(b, src, &maddr); if (!respond) return; if (mtyp != DSC_REQ_MSG) return; tipc_disc_msg_xmit(net, DSC_RESP_MSG, src, self, 0, &maddr, b); } /* tipc_disc_add_dest - increment set of discovered nodes */ void tipc_disc_add_dest(struct tipc_discoverer *d) { spin_lock_bh(&d->lock); d->num_nodes++; spin_unlock_bh(&d->lock); } /* tipc_disc_remove_dest - decrement set of discovered nodes */ void tipc_disc_remove_dest(struct tipc_discoverer *d) { int intv, num; spin_lock_bh(&d->lock); d->num_nodes--; num = d->num_nodes; intv = d->timer_intv; if (!num && (intv == TIPC_DISC_INACTIVE || intv > TIPC_DISC_FAST)) { d->timer_intv = TIPC_DISC_INIT; mod_timer(&d->timer, jiffies + d->timer_intv); } spin_unlock_bh(&d->lock); } /* tipc_disc_timeout - send a periodic link setup request * Called whenever a link setup request timer associated with a bearer expires. 
* - Keep doubling time between sent request until limit is reached; * - Hold at fast polling rate if we don't have any associated nodes * - Otherwise hold at slow polling rate */ static void tipc_disc_timeout(struct timer_list *t) { struct tipc_discoverer *d = from_timer(d, t, timer); struct tipc_net *tn = tipc_net(d->net); struct tipc_media_addr maddr; struct sk_buff *skb = NULL; struct net *net = d->net; u32 bearer_id; spin_lock_bh(&d->lock); /* Stop searching if only desired node has been found */ if (tipc_node(d->domain) && d->num_nodes) { d->timer_intv = TIPC_DISC_INACTIVE; goto exit; } /* Did we just leave trial period ? */ if (!time_before(jiffies, tn->addr_trial_end) && !tipc_own_addr(net)) { mod_timer(&d->timer, jiffies + TIPC_DISC_INIT); spin_unlock_bh(&d->lock); schedule_work(&tn->work); return; } /* Adjust timeout interval according to discovery phase */ if (time_before(jiffies, tn->addr_trial_end)) { d->timer_intv = TIPC_DISC_INIT; } else { d->timer_intv *= 2; if (d->num_nodes && d->timer_intv > TIPC_DISC_SLOW) d->timer_intv = TIPC_DISC_SLOW; else if (!d->num_nodes && d->timer_intv > TIPC_DISC_FAST) d->timer_intv = TIPC_DISC_FAST; msg_set_type(buf_msg(d->skb), DSC_REQ_MSG); msg_set_prevnode(buf_msg(d->skb), tn->trial_addr); } mod_timer(&d->timer, jiffies + d->timer_intv); memcpy(&maddr, &d->dest, sizeof(maddr)); skb = skb_clone(d->skb, GFP_ATOMIC); bearer_id = d->bearer_id; exit: spin_unlock_bh(&d->lock); if (skb) tipc_bearer_xmit_skb(net, bearer_id, skb, &maddr); } /** * tipc_disc_create - create object to send periodic link setup requests * @net: the applicable net namespace * @b: ptr to bearer issuing requests * @dest: destination address for request messages * @skb: pointer to created frame * * Return: 0 if successful, otherwise -errno. 
*/ int tipc_disc_create(struct net *net, struct tipc_bearer *b, struct tipc_media_addr *dest, struct sk_buff **skb) { struct tipc_net *tn = tipc_net(net); struct tipc_discoverer *d; d = kmalloc(sizeof(*d), GFP_ATOMIC); if (!d) return -ENOMEM; d->skb = tipc_buf_acquire(MAX_H_SIZE + NODE_ID_LEN, GFP_ATOMIC); if (!d->skb) { kfree(d); return -ENOMEM; } tipc_disc_init_msg(net, d->skb, DSC_REQ_MSG, b); /* Do we need an address trial period first ? */ if (!tipc_own_addr(net)) { tn->addr_trial_end = jiffies + msecs_to_jiffies(1000); msg_set_type(buf_msg(d->skb), DSC_TRIAL_MSG); } memcpy(&d->dest, dest, sizeof(*dest)); d->net = net; d->bearer_id = b->identity; d->domain = b->domain; d->num_nodes = 0; d->timer_intv = TIPC_DISC_INIT; spin_lock_init(&d->lock); timer_setup(&d->timer, tipc_disc_timeout, 0); mod_timer(&d->timer, jiffies + d->timer_intv); b->disc = d; *skb = skb_clone(d->skb, GFP_ATOMIC); return 0; } /** * tipc_disc_delete - destroy object sending periodic link setup requests * @d: ptr to link dest structure */ void tipc_disc_delete(struct tipc_discoverer *d) { timer_shutdown_sync(&d->timer); kfree_skb(d->skb); kfree(d); } /** * tipc_disc_reset - reset object to send periodic link setup requests * @net: the applicable net namespace * @b: ptr to bearer issuing requests */ void tipc_disc_reset(struct net *net, struct tipc_bearer *b) { struct tipc_discoverer *d = b->disc; struct tipc_media_addr maddr; struct sk_buff *skb; spin_lock_bh(&d->lock); tipc_disc_init_msg(net, d->skb, DSC_REQ_MSG, b); d->net = net; d->bearer_id = b->identity; d->domain = b->domain; d->num_nodes = 0; d->timer_intv = TIPC_DISC_INIT; memcpy(&maddr, &d->dest, sizeof(maddr)); mod_timer(&d->timer, jiffies + d->timer_intv); skb = skb_clone(d->skb, GFP_ATOMIC); spin_unlock_bh(&d->lock); if (skb) tipc_bearer_xmit_skb(net, b->identity, skb, &maddr); }
3 3 3 3 3 57 57 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 
523 524 525 526 527 528 529 530 531 532 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) ST-Ericsson AB 2010 * Authors: Sjur Brendeland * Daniel Martensson */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ #include <linux/fs.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/if_ether.h> #include <linux/ip.h> #include <linux/sched.h> #include <linux/sockios.h> #include <linux/caif/if_caif.h> #include <net/rtnetlink.h> #include <net/caif/caif_layer.h> #include <net/caif/cfpkt.h> #include <net/caif/caif_dev.h> /* GPRS PDP connection has MTU to 1500 */ #define GPRS_PDP_MTU 1500 /* 5 sec. connect timeout */ #define CONNECT_TIMEOUT (5 * HZ) #define CAIF_NET_DEFAULT_QUEUE_LEN 500 #define UNDEF_CONNID 0xffffffff /*This list is protected by the rtnl lock. */ static LIST_HEAD(chnl_net_list); MODULE_DESCRIPTION("ST-Ericsson CAIF modem protocol GPRS network device"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("caif"); enum caif_states { CAIF_CONNECTED = 1, CAIF_CONNECTING, CAIF_DISCONNECTED, CAIF_SHUTDOWN }; struct chnl_net { struct cflayer chnl; struct caif_connect_request conn_req; struct list_head list_field; struct net_device *netdev; char name[256]; wait_queue_head_t netmgmt_wq; /* Flow status to remember and control the transmission. */ bool flowenabled; enum caif_states state; }; static int chnl_recv_cb(struct cflayer *layr, struct cfpkt *pkt) { struct sk_buff *skb; struct chnl_net *priv; int pktlen; const u8 *ip_version; u8 buf; priv = container_of(layr, struct chnl_net, chnl); skb = (struct sk_buff *) cfpkt_tonative(pkt); /* Get length of CAIF packet. */ pktlen = skb->len; /* Pass some minimum information and * send the packet to the net stack. 
*/ skb->dev = priv->netdev; /* check the version of IP */ ip_version = skb_header_pointer(skb, 0, 1, &buf); if (!ip_version) { kfree_skb(skb); return -EINVAL; } switch (*ip_version >> 4) { case 4: skb->protocol = htons(ETH_P_IP); break; case 6: skb->protocol = htons(ETH_P_IPV6); break; default: kfree_skb(skb); priv->netdev->stats.rx_errors++; return -EINVAL; } /* If we change the header in loop mode, the checksum is corrupted. */ if (priv->conn_req.protocol == CAIFPROTO_DATAGRAM_LOOP) skb->ip_summed = CHECKSUM_UNNECESSARY; else skb->ip_summed = CHECKSUM_NONE; netif_rx(skb); /* Update statistics. */ priv->netdev->stats.rx_packets++; priv->netdev->stats.rx_bytes += pktlen; return 0; } static int delete_device(struct chnl_net *dev) { ASSERT_RTNL(); if (dev->netdev) unregister_netdevice(dev->netdev); return 0; } static void close_work(struct work_struct *work) { struct chnl_net *dev = NULL; struct list_head *list_node; struct list_head *_tmp; rtnl_lock(); list_for_each_safe(list_node, _tmp, &chnl_net_list) { dev = list_entry(list_node, struct chnl_net, list_field); if (dev->state == CAIF_SHUTDOWN) dev_close(dev->netdev); } rtnl_unlock(); } static DECLARE_WORK(close_worker, close_work); static void chnl_hold(struct cflayer *lyr) { struct chnl_net *priv = container_of(lyr, struct chnl_net, chnl); dev_hold(priv->netdev); } static void chnl_put(struct cflayer *lyr) { struct chnl_net *priv = container_of(lyr, struct chnl_net, chnl); dev_put(priv->netdev); } static void chnl_flowctrl_cb(struct cflayer *layr, enum caif_ctrlcmd flow, int phyid) { struct chnl_net *priv = container_of(layr, struct chnl_net, chnl); pr_debug("NET flowctrl func called flow: %s\n", flow == CAIF_CTRLCMD_FLOW_ON_IND ? "ON" : flow == CAIF_CTRLCMD_INIT_RSP ? "INIT" : flow == CAIF_CTRLCMD_FLOW_OFF_IND ? "OFF" : flow == CAIF_CTRLCMD_DEINIT_RSP ? "CLOSE/DEINIT" : flow == CAIF_CTRLCMD_INIT_FAIL_RSP ? "OPEN_FAIL" : flow == CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND ? 
"REMOTE_SHUTDOWN" : "UNKNOWN CTRL COMMAND"); switch (flow) { case CAIF_CTRLCMD_FLOW_OFF_IND: priv->flowenabled = false; netif_stop_queue(priv->netdev); break; case CAIF_CTRLCMD_DEINIT_RSP: priv->state = CAIF_DISCONNECTED; break; case CAIF_CTRLCMD_INIT_FAIL_RSP: priv->state = CAIF_DISCONNECTED; wake_up_interruptible(&priv->netmgmt_wq); break; case CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND: priv->state = CAIF_SHUTDOWN; netif_tx_disable(priv->netdev); schedule_work(&close_worker); break; case CAIF_CTRLCMD_FLOW_ON_IND: priv->flowenabled = true; netif_wake_queue(priv->netdev); break; case CAIF_CTRLCMD_INIT_RSP: caif_client_register_refcnt(&priv->chnl, chnl_hold, chnl_put); priv->state = CAIF_CONNECTED; priv->flowenabled = true; netif_wake_queue(priv->netdev); wake_up_interruptible(&priv->netmgmt_wq); break; default: break; } } static netdev_tx_t chnl_net_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct chnl_net *priv; struct cfpkt *pkt = NULL; int len; int result = -1; /* Get our private data. */ priv = netdev_priv(dev); if (skb->len > priv->netdev->mtu) { pr_warn("Size of skb exceeded MTU\n"); kfree_skb(skb); dev->stats.tx_errors++; return NETDEV_TX_OK; } if (!priv->flowenabled) { pr_debug("dropping packets flow off\n"); kfree_skb(skb); dev->stats.tx_dropped++; return NETDEV_TX_OK; } if (priv->conn_req.protocol == CAIFPROTO_DATAGRAM_LOOP) swap(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); /* Store original SKB length. */ len = skb->len; pkt = cfpkt_fromnative(CAIF_DIR_OUT, (void *) skb); /* Send the packet down the stack. */ result = priv->chnl.dn->transmit(priv->chnl.dn, pkt); if (result) { dev->stats.tx_dropped++; return NETDEV_TX_OK; } /* Update statistics. 
*/ dev->stats.tx_packets++; dev->stats.tx_bytes += len; return NETDEV_TX_OK; } static int chnl_net_open(struct net_device *dev) { struct chnl_net *priv = NULL; int result = -1; int llifindex, headroom, tailroom, mtu; struct net_device *lldev; ASSERT_RTNL(); priv = netdev_priv(dev); if (!priv) { pr_debug("chnl_net_open: no priv\n"); return -ENODEV; } if (priv->state != CAIF_CONNECTING) { priv->state = CAIF_CONNECTING; result = caif_connect_client(dev_net(dev), &priv->conn_req, &priv->chnl, &llifindex, &headroom, &tailroom); if (result != 0) { pr_debug("err: " "Unable to register and open device," " Err:%d\n", result); goto error; } lldev = __dev_get_by_index(dev_net(dev), llifindex); if (lldev == NULL) { pr_debug("no interface?\n"); result = -ENODEV; goto error; } dev->needed_tailroom = tailroom + lldev->needed_tailroom; dev->hard_header_len = headroom + lldev->hard_header_len + lldev->needed_tailroom; /* * MTU, head-room etc is not know before we have a * CAIF link layer device available. MTU calculation may * override initial RTNL configuration. * MTU is minimum of current mtu, link layer mtu pluss * CAIF head and tail, and PDP GPRS contexts max MTU. 
*/ mtu = min_t(int, dev->mtu, lldev->mtu - (headroom + tailroom)); mtu = min_t(int, GPRS_PDP_MTU, mtu); dev_set_mtu(dev, mtu); if (mtu < 100) { pr_warn("CAIF Interface MTU too small (%d)\n", mtu); result = -ENODEV; goto error; } } rtnl_unlock(); /* Release RTNL lock during connect wait */ result = wait_event_interruptible_timeout(priv->netmgmt_wq, priv->state != CAIF_CONNECTING, CONNECT_TIMEOUT); rtnl_lock(); if (result == -ERESTARTSYS) { pr_debug("wait_event_interruptible woken by a signal\n"); result = -ERESTARTSYS; goto error; } if (result == 0) { pr_debug("connect timeout\n"); result = -ETIMEDOUT; goto error; } if (priv->state != CAIF_CONNECTED) { pr_debug("connect failed\n"); result = -ECONNREFUSED; goto error; } pr_debug("CAIF Netdevice connected\n"); return 0; error: caif_disconnect_client(dev_net(dev), &priv->chnl); priv->state = CAIF_DISCONNECTED; pr_debug("state disconnected\n"); return result; } static int chnl_net_stop(struct net_device *dev) { struct chnl_net *priv; ASSERT_RTNL(); priv = netdev_priv(dev); priv->state = CAIF_DISCONNECTED; caif_disconnect_client(dev_net(dev), &priv->chnl); return 0; } static int chnl_net_init(struct net_device *dev) { struct chnl_net *priv; ASSERT_RTNL(); priv = netdev_priv(dev); strncpy(priv->name, dev->name, sizeof(priv->name)); INIT_LIST_HEAD(&priv->list_field); return 0; } static void chnl_net_uninit(struct net_device *dev) { struct chnl_net *priv; ASSERT_RTNL(); priv = netdev_priv(dev); list_del_init(&priv->list_field); } static const struct net_device_ops netdev_ops = { .ndo_open = chnl_net_open, .ndo_stop = chnl_net_stop, .ndo_init = chnl_net_init, .ndo_uninit = chnl_net_uninit, .ndo_start_xmit = chnl_net_start_xmit, }; static void chnl_net_destructor(struct net_device *dev) { struct chnl_net *priv = netdev_priv(dev); caif_free_client(&priv->chnl); } static void ipcaif_net_setup(struct net_device *dev) { struct chnl_net *priv; dev->netdev_ops = &netdev_ops; dev->needs_free_netdev = true; dev->priv_destructor = 
chnl_net_destructor; dev->flags |= IFF_NOARP; dev->flags |= IFF_POINTOPOINT; dev->mtu = GPRS_PDP_MTU; dev->tx_queue_len = CAIF_NET_DEFAULT_QUEUE_LEN; priv = netdev_priv(dev); priv->chnl.receive = chnl_recv_cb; priv->chnl.ctrlcmd = chnl_flowctrl_cb; priv->netdev = dev; priv->conn_req.protocol = CAIFPROTO_DATAGRAM; priv->conn_req.link_selector = CAIF_LINK_HIGH_BANDW; priv->conn_req.priority = CAIF_PRIO_LOW; /* Insert illegal value */ priv->conn_req.sockaddr.u.dgm.connection_id = UNDEF_CONNID; priv->flowenabled = false; init_waitqueue_head(&priv->netmgmt_wq); } static int ipcaif_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct chnl_net *priv; u8 loop; priv = netdev_priv(dev); if (nla_put_u32(skb, IFLA_CAIF_IPV4_CONNID, priv->conn_req.sockaddr.u.dgm.connection_id) || nla_put_u32(skb, IFLA_CAIF_IPV6_CONNID, priv->conn_req.sockaddr.u.dgm.connection_id)) goto nla_put_failure; loop = priv->conn_req.protocol == CAIFPROTO_DATAGRAM_LOOP; if (nla_put_u8(skb, IFLA_CAIF_LOOPBACK, loop)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static void caif_netlink_parms(struct nlattr *data[], struct caif_connect_request *conn_req) { if (!data) { pr_warn("no params data found\n"); return; } if (data[IFLA_CAIF_IPV4_CONNID]) conn_req->sockaddr.u.dgm.connection_id = nla_get_u32(data[IFLA_CAIF_IPV4_CONNID]); if (data[IFLA_CAIF_IPV6_CONNID]) conn_req->sockaddr.u.dgm.connection_id = nla_get_u32(data[IFLA_CAIF_IPV6_CONNID]); if (data[IFLA_CAIF_LOOPBACK]) { if (nla_get_u8(data[IFLA_CAIF_LOOPBACK])) conn_req->protocol = CAIFPROTO_DATAGRAM_LOOP; else conn_req->protocol = CAIFPROTO_DATAGRAM; } } static int ipcaif_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { int ret; struct chnl_net *caifdev; ASSERT_RTNL(); caifdev = netdev_priv(dev); caif_netlink_parms(data, &caifdev->conn_req); ret = register_netdevice(dev); if (ret) pr_warn("device rtml registration failed\n"); else 
list_add(&caifdev->list_field, &chnl_net_list); /* Use ifindex as connection id, and use loopback channel default. */ if (caifdev->conn_req.sockaddr.u.dgm.connection_id == UNDEF_CONNID) { caifdev->conn_req.sockaddr.u.dgm.connection_id = dev->ifindex; caifdev->conn_req.protocol = CAIFPROTO_DATAGRAM_LOOP; } return ret; } static int ipcaif_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct chnl_net *caifdev; ASSERT_RTNL(); caifdev = netdev_priv(dev); caif_netlink_parms(data, &caifdev->conn_req); netdev_state_change(dev); return 0; } static size_t ipcaif_get_size(const struct net_device *dev) { return /* IFLA_CAIF_IPV4_CONNID */ nla_total_size(4) + /* IFLA_CAIF_IPV6_CONNID */ nla_total_size(4) + /* IFLA_CAIF_LOOPBACK */ nla_total_size(2) + 0; } static const struct nla_policy ipcaif_policy[IFLA_CAIF_MAX + 1] = { [IFLA_CAIF_IPV4_CONNID] = { .type = NLA_U32 }, [IFLA_CAIF_IPV6_CONNID] = { .type = NLA_U32 }, [IFLA_CAIF_LOOPBACK] = { .type = NLA_U8 } }; static struct rtnl_link_ops ipcaif_link_ops __read_mostly = { .kind = "caif", .priv_size = sizeof(struct chnl_net), .setup = ipcaif_net_setup, .maxtype = IFLA_CAIF_MAX, .policy = ipcaif_policy, .newlink = ipcaif_newlink, .changelink = ipcaif_changelink, .get_size = ipcaif_get_size, .fill_info = ipcaif_fill_info, }; static int __init chnl_init_module(void) { return rtnl_link_register(&ipcaif_link_ops); } static void __exit chnl_exit_module(void) { struct chnl_net *dev = NULL; struct list_head *list_node; struct list_head *_tmp; rtnl_link_unregister(&ipcaif_link_ops); rtnl_lock(); list_for_each_safe(list_node, _tmp, &chnl_net_list) { dev = list_entry(list_node, struct chnl_net, list_field); list_del_init(list_node); delete_device(dev); } rtnl_unlock(); } module_init(chnl_init_module); module_exit(chnl_exit_module);
5 507 441 455 379 107 104 69 40 343 278 75 75 586 4 592 89 507 32 32 64 1 63 1 60 5 57 1 10 9 2 2 477 19 18 10 10 9 1 58 46 45 21 3 2 3 19 65 52 2 21 32 2 3 22 1 5 1 16 5 13 2 2 11 1 11 340 543 9 11 1 1 1 51 129 80 82 82 84 61 97 16 27 27 17 16 14 26 26 2 2 145 147 1 1 1 20 18 215 58 60 3 215 200 215 134 1 122 10 128 7 134 129 135 135 1 130 123 16 3 1 131 4 1 1 1 225 222 230 73 85 1 129 127 4 1 204 1 221 218 3 11 11 11 65 164 219 211 202 196 198 61 11 202 204 5 195 205 198 5 64 159 5 1 5 1 1 470 1 1 6 468 1 1 3 481 479 1 1 4 9 13 1 459 178 297 480 486 442 5 1 471 211 1 1 8 5 246 256 261 7 1 1 127 98 2 101 103 98 3 17 1 1 1 1 5 3 2 4 4 6 2 1 5 1 1 3 1 542 1 4 1 15 20 500 450 3 450 1 2 1 12 1 2 1 1 1 1 140 83 2 1 1 6 138 1 138 4 483 2 14 14 10 463 216 268 213 5 4 4 3 4 1 5 5 1 4 3 4 4 4 16 16 13 12 5 22 23 23 5 4 4 2 2 6 5 2 4 1 2 1 1 22 2 19 24 89 1 1 2 2 13 64 1 6 6 41 41 25 61 10 6 4 3 3 2 40 5 18 6 9 4 8 7 2 7 3 15 15 2 4 2 6 2 18 1 2 14 13 8 4 15 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 
286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 
786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 
1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 
1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 
2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 
2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/sch_api.c Packet scheduler API. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * Fixes: * * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired. * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support */ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/init.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/kmod.h> #include <linux/list.h> #include <linux/hrtimer.h> #include <linux/slab.h> #include <linux/hashtable.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/tc_wrapper.h> #include <trace/events/qdisc.h> /* Short review. ------------- This file consists of two interrelated parts: 1. queueing disciplines manager frontend. 2. traffic classes manager frontend. Generally, queueing discipline ("qdisc") is a black box, which is able to enqueue packets and to dequeue them (when device is ready to send something) in order and at times determined by algorithm hidden in it. qdisc's are divided to two categories: - "queues", which have no internal structure visible from outside. - "schedulers", which split all the packets to "traffic classes", using "packet classifiers" (look at cls_api.c) In turn, classes may have child qdiscs (as rule, queues) attached to them etc. etc. etc. The goal of the routines in this file is to translate information supplied by user in the form of handles to more intelligible for kernel form, to make some sanity checks and part of work, which is common to all qdiscs and to provide rtnetlink notifications. 
All real intelligent work is done inside qdisc modules.

   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns a skb to send. It is allowed to return NULL,
   but that does not mean that the queue is empty; it just means that
   the discipline does not want to send anything this time.
   The queue is really empty only if q->q.qlen == 0.
   For complicated disciplines with multiple queues, q->q is not a
   real packet queue, but q->q.qlen must nevertheless be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code.
   NET_XMIT_DROP	- this packet was dropped
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN		- this packet was probably enqueued, but another one
			  was dropped.
     Expected action: back off or ignore

   Auxiliary routines:

   ---peek

   like dequeue, but without removing a packet from the queue

   ---reset

   returns the qdisc to its initial state: purge all buffers, clear all
   timers, counters (except for statistics) etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
 */

/* Protects the list of registered TC modules. It is a pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines.
*/
static struct Qdisc_ops *qdisc_base;

/* Register/unregister queueing discipline */

/*
 * Append @qops to the list headed by qdisc_base.
 *
 * Missing enqueue/peek/dequeue callbacks are filled in from noop_qdisc_ops;
 * ops that supply dequeue without peek are rejected.  Classful ops must
 * provide find, walk and leaf, and must provide bind_tcf and unbind_tcf
 * whenever tcf_block is set.
 *
 * Returns 0 on success, -EEXIST if ops with the same id are already
 * registered, -EINVAL if the callback set is incomplete.
 */
int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	/* On normal loop exit qp points at the terminating NULL link. */
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->peek == NULL) {
		if (qops->dequeue == NULL)
			qops->peek = noop_qdisc_ops.peek;
		else
			goto out_einval;
	}
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	if (qops->cl_ops) {
		const struct Qdisc_class_ops *cops = qops->cl_ops;

		if (!(cops->find && cops->walk && cops->leaf))
			goto out_einval;

		if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
			goto out_einval;
	}

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;

out_einval:
	rc = -EINVAL;
	goto out;
}
EXPORT_SYMBOL(register_qdisc);

/*
 * Unlink @qops from the registered list.  If @qops is not on the list a
 * WARN naming its id is emitted; nothing is returned to the caller.
 */
void unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);

	WARN(err, "unregister qdisc(%s) failed\n", qops->id);
}
EXPORT_SYMBOL(unregister_qdisc);

/* Get default qdisc if not otherwise specified */
/* Copies default_qdisc_ops->id into @name (bounded by @len) under the read lock. */
void qdisc_get_default(char *name, size_t len)
{
	read_lock(&qdisc_mod_lock);
	strscpy(name, default_qdisc_ops->id, len);
	read_unlock(&qdisc_mod_lock);
}

/*
 * Find registered ops whose id equals @name and take a module reference.
 * Returns NULL if nothing matches or try_module_get() fails.  Takes no lock
 * itself; the caller in this file holds qdisc_mod_lock.
 */
static struct Qdisc_ops *qdisc_lookup_default(const char *name)
{
	struct Qdisc_ops *q = NULL;

	for (q = qdisc_base; q; q = q->next) {
		if (!strcmp(name, q->id)) {
			if (!try_module_get(q->owner))
				q = NULL;
			break;
		}
	}

	return q;
}

/* Set new default qdisc to use */
/*
 * Requires CAP_NET_ADMIN (-EPERM otherwise).  If @name is not registered,
 * a module load is requested and the lookup retried once.
 * Returns 0 on success, -ENOENT if the qdisc is still unknown.
 */
int qdisc_set_default(const char *name)
{
	const struct Qdisc_ops *ops;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	write_lock(&qdisc_mod_lock);
	ops = qdisc_lookup_default(name);
	if (!ops) {
		/* Not found, drop lock and try to load module */
write_unlock(&qdisc_mod_lock);
		request_module(NET_SCH_ALIAS_PREFIX "%s", name);

		write_lock(&qdisc_mod_lock);
		ops = qdisc_lookup_default(name);
	}

	if (ops) {
		/* Set new default */
		/* Drop the module reference held for the previous default. */
		module_put(default_qdisc_ops->owner);
		default_qdisc_ops = ops;
	}
	write_unlock(&qdisc_mod_lock);

	return ops ? 0 : -ENOENT;
}

#ifdef CONFIG_NET_SCH_DEFAULT
/* Set default value from kernel config */
static int __init sch_default_qdisc(void)
{
	return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
}
late_initcall(sch_default_qdisc);
#endif

/* We know handle. Find qdisc among all qdisc's attached to device
 * (root qdisc, all its children, children of children etc.)
 * Note: caller either uses rtnl or rcu_read_lock()
 *
 * A root without a device is only compared against itself.  Otherwise a
 * non-builtin root is compared first, then the device's qdisc_hash is
 * searched.  Returns the matching qdisc or NULL.
 */
static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
{
	struct Qdisc *q;

	if (!qdisc_dev(root))
		return (root->handle == handle ? root : NULL);

	if (!(root->flags & TCQ_F_BUILTIN) &&
	    root->handle == handle)
		return root;

	hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle,
				   lockdep_rtnl_is_held()) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}

/*
 * Add @q to its device's qdisc_hash, keyed by handle.  Qdiscs whose parent
 * is TC_H_ROOT and ingress qdiscs are not hashed.  With @invisible set the
 * qdisc is also flagged TCQ_F_INVISIBLE.  RTNL is asserted on the hashing
 * path.
 */
void qdisc_hash_add(struct Qdisc *q, bool invisible)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
		ASSERT_RTNL();
		hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
		if (invisible)
			q->flags |= TCQ_F_INVISIBLE;
	}
}
EXPORT_SYMBOL(qdisc_hash_add);

/* Remove @q from the hash; same exclusions as qdisc_hash_add(). */
void qdisc_hash_del(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
		ASSERT_RTNL();
		hash_del_rcu(&q->hash);
	}
}
EXPORT_SYMBOL(qdisc_hash_del);

/*
 * Look up @handle on @dev using rtnl_dereference(): first below dev->qdisc,
 * then below the ingress queue's qdisc_sleeping if the device has an
 * ingress queue.  A zero handle yields NULL.
 */
struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	if (!handle)
		return NULL;
	q = qdisc_match_from_root(rtnl_dereference(dev->qdisc), handle);
	if (q)
		goto out;

	if (dev_ingress_queue(dev))
		q = qdisc_match_from_root(
			rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping),
			handle);
out:
	return q;
}

/* Same lookup as qdisc_lookup(), using the rcu_dereference() accessors. */
struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
{
	struct netdev_queue *nq;
	struct Qdisc *q;

	if (!handle)
		return NULL;
	q
= qdisc_match_from_root(rcu_dereference(dev->qdisc), handle);
	if (q)
		goto out;

	nq = dev_ingress_queue_rcu(dev);
	if (nq)
		q = qdisc_match_from_root(rcu_dereference(nq->qdisc_sleeping),
					  handle);
out:
	return q;
}

/*
 * Return the leaf qdisc of class @classid of @p, obtained through the
 * class ops' find() and leaf() callbacks.  Returns NULL if @p has no class
 * ops or find() returns 0.
 */
static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	const struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->find(p, classid);

	if (cl == 0)
		return NULL;
	return cops->leaf(p, cl);
}

/* Find queueing discipline by name */
/*
 * @kind is a netlink string attribute compared with nla_strcmp().  On a
 * match a module reference is taken.  Returns NULL if @kind is NULL, no ops
 * match, or try_module_get() fails.
 */
static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (nla_strcmp(kind, q->id) == 0) {
				if (!try_module_get(q->owner))
					q = NULL;
				break;
			}
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}

/* The linklayer setting were not transferred from iproute2, in older
 * versions, and the rate tables lookup systems have been dropped in
 * the kernel. To keep backward compatible with older iproute2 tc
 * utils, we detect the linklayer setting by detecting if the rate
 * table were modified.
 *
 * For linklayer ATM table entries, the rate table will be aligned to
 * 48 bytes, thus some table entries will contain the same value.  The
 * mpu (min packet unit) is also encoded into the old rate table, thus
 * starting from the mpu, we find low and high table entries for
 * mapping this cell.  If these entries contain the same value, when
 * the rate tables have been modified for linklayer ATM.
 *
 * This is done by rounding mpu to the nearest 48 bytes cell/entry,
 * and then roundup to the next cell, calc the table entry one below,
 * and compare.
*/ static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab) { int low = roundup(r->mpu, 48); int high = roundup(low+1, 48); int cell_low = low >> r->cell_log; int cell_high = (high >> r->cell_log) - 1; /* rtab is too inaccurate at rates > 100Mbit/s */ if ((r->rate > (100000000/8)) || (rtab[0] == 0)) { pr_debug("TC linklayer: Giving up ATM detection\n"); return TC_LINKLAYER_ETHERNET; } if ((cell_high > cell_low) && (cell_high < 256) && (rtab[cell_low] == rtab[cell_high])) { pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n", cell_low, cell_high, rtab[cell_high]); return TC_LINKLAYER_ATM; } return TC_LINKLAYER_ETHERNET; } static struct qdisc_rate_table *qdisc_rtab_list; struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab, struct netlink_ext_ack *extack) { struct qdisc_rate_table *rtab; if (tab == NULL || r->rate == 0 || r->cell_log == 0 || r->cell_log >= 32 || nla_len(tab) != TC_RTAB_SIZE) { NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching"); return NULL; } for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) { if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) && !memcmp(&rtab->data, nla_data(tab), 1024)) { rtab->refcnt++; return rtab; } } rtab = kmalloc(sizeof(*rtab), GFP_KERNEL); if (rtab) { rtab->rate = *r; rtab->refcnt = 1; memcpy(rtab->data, nla_data(tab), 1024); if (r->linklayer == TC_LINKLAYER_UNAWARE) r->linklayer = __detect_linklayer(r, rtab->data); rtab->next = qdisc_rtab_list; qdisc_rtab_list = rtab; } else { NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table"); } return rtab; } EXPORT_SYMBOL(qdisc_get_rtab); void qdisc_put_rtab(struct qdisc_rate_table *tab) { struct qdisc_rate_table *rtab, **rtabp; if (!tab || --tab->refcnt) return; for (rtabp = &qdisc_rtab_list; (rtab = *rtabp) != NULL; rtabp = &rtab->next) { if (rtab == tab) { *rtabp = rtab->next; kfree(rtab); return; } } } EXPORT_SYMBOL(qdisc_put_rtab); static LIST_HEAD(qdisc_stab_list); static const struct 
nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};

/*
 * Parse the nested size-table attribute @opt and return a matching
 * reference-counted size table, reusing an existing list entry when both
 * the sizespec and the table data are identical.
 *
 * Returns an ERR_PTR(): the parse error, -EINVAL for a missing base or data
 * attribute, a table length that disagrees with tsize, or size_log/cell_log
 * above STAB_SIZE_LOG_MAX, and -ENOMEM on allocation failure.
 */
static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
					       struct netlink_ext_ack *extack)
{
	struct nlattr *tb[TCA_STAB_MAX + 1];
	struct qdisc_size_table *stab;
	struct tc_sizespec *s;
	unsigned int tsize = 0;
	u16 *tab = NULL;
	int err;

	err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
					  extack);
	if (err < 0)
		return ERR_PTR(err);
	if (!tb[TCA_STAB_BASE]) {
		NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
		return ERR_PTR(-EINVAL);
	}

	s = nla_data(tb[TCA_STAB_BASE]);

	if (s->tsize > 0) {
		if (!tb[TCA_STAB_DATA]) {
			NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
			return ERR_PTR(-EINVAL);
		}
		tab = nla_data(tb[TCA_STAB_DATA]);
		/* Number of u16 entries actually supplied. */
		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
	}

	if (tsize != s->tsize || (!tab && tsize > 0)) {
		NL_SET_ERR_MSG(extack, "Invalid size of size table");
		return ERR_PTR(-EINVAL);
	}

	list_for_each_entry(stab, &qdisc_stab_list, list) {
		if (memcmp(&stab->szopts, s, sizeof(*s)))
			continue;
		if (tsize > 0 &&
		    memcmp(stab->data, tab, flex_array_size(stab, data, tsize)))
			continue;
		stab->refcnt++;
		return stab;
	}

	if (s->size_log > STAB_SIZE_LOG_MAX ||
	    s->cell_log > STAB_SIZE_LOG_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table");
		return ERR_PTR(-EINVAL);
	}

	stab = kmalloc(struct_size(stab, data, tsize), GFP_KERNEL);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	stab->refcnt = 1;
	stab->szopts = *s;
	if (tsize > 0)
		memcpy(stab->data, tab, flex_array_size(stab, data, tsize));

	list_add_tail(&stab->list, &qdisc_stab_list);

	return stab;
}

/*
 * Drop one reference on @tab (NULL is accepted).  On the last reference the
 * table is unlinked and freed through kfree_rcu().
 */
void qdisc_put_stab(struct qdisc_size_table *tab)
{
	if (!tab)
		return;

	if (--tab->refcnt == 0) {
		list_del(&tab->list);
		kfree_rcu(tab, rcu);
	}
}
EXPORT_SYMBOL(qdisc_put_stab);

/*
 * Emit a TCA_STAB nest containing only TCA_STAB_BASE (the sizespec).
 * Returns skb->len on success, -1 if the skb has no room.
 */
static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	struct nlattr *nest;

	nest = nla_nest_start_noflag(skb, TCA_STAB);
	if (nest ==
NULL)
		goto nla_put_failure;
	if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
		goto nla_put_failure;
	nla_nest_end(skb, nest);

	return skb->len;

nla_put_failure:
	return -1;
}

/*
 * Compute the packet length to account for @skb according to size table
 * @stab and store it in qdisc_skb_cb(skb)->pkt_len.
 *
 * With an empty table (tsize == 0) the result is skb->len + overhead.
 * Otherwise the table slot is (len + cell_align) >> cell_log, with negative
 * values treated as 0; slots past the end of the table are extrapolated
 * from the last entry.  The looked-up value is scaled by size_log.  The
 * stored length is never less than 1.
 */
void __qdisc_calculate_pkt_len(struct sk_buff *skb,
			       const struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	if (unlikely(!stab->szopts.tsize))
		goto out;

	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(__qdisc_calculate_pkt_len);

/*
 * Print a one-time warning for @qdisc; TCQ_F_WARN_NONWC records that the
 * warning has already been issued.  The major part of the handle is shown.
 */
void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
{
	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
		pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
			txt, qdisc->ops->id, qdisc->handle >> 16);
		qdisc->flags |= TCQ_F_WARN_NONWC;
	}
}
EXPORT_SYMBOL(qdisc_warn_nonwc);

/* hrtimer callback: schedule the root of the watched qdisc; never re-arms. */
static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
						 timer);

	rcu_read_lock();
	__netif_schedule(qdisc_root(wd->qdisc));
	rcu_read_unlock();

	return HRTIMER_NORESTART;
}

/* Initialise @wd for @qdisc with an absolute, pinned hrtimer on @clockid. */
void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
				 clockid_t clockid)
{
	hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
	wd->timer.function = qdisc_watchdog;
	wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init_clockid);

/* As qdisc_watchdog_init_clockid() with CLOCK_MONOTONIC. */
void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
}
EXPORT_SYMBOL(qdisc_watchdog_init);

/*
 * Arm the watchdog to fire at @expires (ns, absolute) with @delta_ns of
 * slack.  Nothing is done if the root sleeping qdisc is marked
 * deactivated, or if the timer is already queued inside the requested
 * window.
 */
void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
				      u64 delta_ns)
{
	bool deactivated;

	rcu_read_lock();
	deactivated = test_bit(__QDISC_STATE_DEACTIVATED,
&qdisc_root_sleeping(wd->qdisc)->state);
	rcu_read_unlock();
	if (deactivated)
		return;

	if (hrtimer_is_queued(&wd->timer)) {
		u64 softexpires;

		softexpires = ktime_to_ns(hrtimer_get_softexpires(&wd->timer));
		/* If timer is already set in [expires, expires + delta_ns],
		 * do not reprogram it.
		 */
		/* Unsigned subtraction: an earlier softexpires wraps and fails the test. */
		if (softexpires - expires <= delta_ns)
			return;
	}

	hrtimer_start_range_ns(&wd->timer,
			       ns_to_ktime(expires),
			       delta_ns,
			       HRTIMER_MODE_ABS_PINNED);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns);

/* Cancel the watchdog timer of @wd. */
void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);

/* Allocate an array of @n initialised hash buckets; NULL on failure. */
static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
{
	struct hlist_head *h;
	unsigned int i;

	h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);

	if (h != NULL) {
		for (i = 0; i < n; i++)
			INIT_HLIST_HEAD(&h[i]);
	}
	return h;
}

/*
 * Double the bucket count of @clhash once its load factor exceeds 0.75.
 * If the new array cannot be allocated the table is left as it is.  Classes
 * are moved to the new buckets under sch_tree_lock(); the old array is
 * freed after the lock is dropped.
 */
void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	for (i = 0; i < osize; i++) {
		hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash     = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	kvfree(ohash);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);

/*
 * Set up @clhash as an empty table with 4 buckets.
 * Returns 0, or -ENOMEM if the bucket array cannot be allocated.
 */
int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
{
	unsigned int size = 4;

	clhash->hash = qdisc_class_hash_alloc(size);
	if (!clhash->hash)
		return -ENOMEM;
	clhash->hashsize  = size;
	clhash->hashmask  = size - 1;
	clhash->hashelems = 0;
	return 0;
}
EXPORT_SYMBOL(qdisc_class_hash_init);

void
qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	/* Frees only the bucket array, not the classes linked into it. */
	kvfree(clhash->hash);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);

/* Link @cl into the bucket selected by its classid and count it. */
void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	unsigned int h;

	INIT_HLIST_NODE(&cl->hnode);
	h = qdisc_class_hash(cl->classid, clhash->hashmask);
	hlist_add_head(&cl->hnode, &clhash->hash[h]);
	clhash->hashelems++;
}
EXPORT_SYMBOL(qdisc_class_hash_insert);

/* Unlink @cl from its bucket and decrement the element count. */
void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);

/* Allocate an unique handle from space managed by kernel
 * Possible range is [8000-FFFF]:0000 (0x8000 values)
 *
 * The candidate is kept in a function-static variable, so each call
 * continues from the handle tried last.  Returns 0 if no unused handle is
 * found on @dev after 0x8000 attempts.
 */
static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x8000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		/* Wrap back to the start of the range instead of using TC_H_ROOT. */
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
		if (!qdisc_lookup(dev, autohandle))
			return autohandle;
		cond_resched();
	} while	(--i > 0);

	return 0;
}

/*
 * Propagate a reduction of @n packets and @len bytes from @sch to its
 * ancestors: for each ancestor found, q.qlen and qstats.backlog are
 * reduced and max(@n, 0) is added to its drop statistics.  The walk stops
 * at an ingress parent, at a qdisc flagged TCQ_F_NOPARENT, or when the
 * parent cannot be looked up.  A call with both @n and @len zero does
 * nothing.
 */
void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
{
	bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;
	bool notify;
	int drops;

	if (n == 0 && len == 0)
		return;
	drops = max_t(int, n, 0);
	rcu_read_lock();
	while ((parentid = sch->parent)) {
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			break;

		if (sch->flags & TCQ_F_NOPARENT)
			break;
		/* Notify parent qdisc only if child qdisc becomes empty.
		 *
		 * If child was empty even before update then backlog
		 * counter is screwed and we skip notification because
		 * parent class is already passive.
		 *
		 * If the original child was offloaded then it is allowed
		 * to be seem as empty, so the parent is notified anyway.
*/
		notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
						       !qdisc_is_offloaded);
		/* TODO: perform the search on a per txq basis */
		/* NOTE(review): qdisc_lookup() is the rtnl_dereference()-based
		 * lookup although this loop runs under rcu_read_lock() --
		 * confirm all callers hold RTNL.
		 */
		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			WARN_ON_ONCE(parentid != TC_H_ROOT);
			break;
		}
		cops = sch->ops->cl_ops;
		if (notify && cops->qlen_notify) {
			cl = cops->find(sch, parentid);
			cops->qlen_notify(sch, cl);
		}
		sch->q.qlen -= n;
		sch->qstats.backlog -= len;
		__qdisc_qstats_drop(sch, drops);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(qdisc_tree_reduce_backlog);

/*
 * Forward a qdisc offload request (@type, @type_data) to the driver and
 * track the result in TCQ_F_OFFLOADED: the flag is cleared first and set
 * again only if ndo_setup_tc() returns 0.
 *
 * Returns 0 if the device cannot offload or the driver answers
 * -EOPNOTSUPP; otherwise the driver's return value.
 */
int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
			      void *type_data)
{
	struct net_device *dev = qdisc_dev(sch);
	int err;

	sch->flags &= ~TCQ_F_OFFLOADED;
	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
		return 0;

	err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
	if (err == -EOPNOTSUPP)
		return 0;

	if (!err)
		sch->flags |= TCQ_F_OFFLOADED;

	return err;
}
EXPORT_SYMBOL(qdisc_offload_dump_helper);

/*
 * Forward a graft request to the driver.  A driver error is reported only
 * through @extack, and only when a real new qdisc is being grafted and at
 * least one of @sch, @old and @new is offloaded.
 */
void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
				struct Qdisc *new, struct Qdisc *old,
				enum tc_setup_type type, void *type_data,
				struct netlink_ext_ack *extack)
{
	bool any_qdisc_is_offloaded;
	int err;

	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
		return;

	err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);

	/* Don't report error if the graft is part of destroy operation. */
	if (!err || !new || new == &noop_qdisc)
		return;

	/* Don't report error if the parent, the old child and the new
	 * one are not offloaded.
*/
	any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
	any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
	any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;

	if (any_qdisc_is_offloaded)
		NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
}
EXPORT_SYMBOL(qdisc_offload_graft_helper);

/*
 * Zero @caps (@caps_len bytes) and, if the driver implements
 * ndo_setup_tc(), issue a TC_QUERY_CAPS request for @type against it.
 * The driver's return value is ignored.
 */
void qdisc_offload_query_caps(struct net_device *dev,
			      enum tc_setup_type type,
			      void *caps, size_t caps_len)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	struct tc_query_caps_base base = {
		.type = type,
		.caps = caps,
	};

	memset(caps, 0, caps_len);

	if (ops->ndo_setup_tc)
		ops->ndo_setup_tc(dev, TC_QUERY_CAPS, &base);
}
EXPORT_SYMBOL(qdisc_offload_query_caps);

/*
 * Send a TC_ROOT_GRAFT request for the device root.  The handle is that of
 * @new (0 if there is none); the request is marked ingress if either qdisc
 * carries TCQ_F_INGRESS.
 */
static void qdisc_offload_graft_root(struct net_device *dev,
				     struct Qdisc *new, struct Qdisc *old,
				     struct netlink_ext_ack *extack)
{
	struct tc_root_qopt_offload graft_offload = {
		.command	= TC_ROOT_GRAFT,
		.handle		= new ? new->handle : 0,
		.ingress	= (new && new->flags & TCQ_F_INGRESS) ||
				  (old && old->flags & TCQ_F_INGRESS),
	};

	qdisc_offload_graft_helper(dev, NULL, new, old,
				   TC_SETUP_ROOT_QDISC, &graft_offload, extack);
}

/*
 * Append one netlink message of type @event describing @q to @skb.
 *
 * Returns skb->len on success.  On any failure the skb is trimmed back to
 * where the message started and -1 is returned.
 */
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 portid, u32 seq, u16 flags, int event,
			 struct netlink_ext_ack *extack)
{
	struct gnet_stats_basic_sync __percpu *cpu_bstats = NULL;
	struct gnet_stats_queue __percpu *cpu_qstats = NULL;
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	/* Start of this message, used for trimming and for nlmsg_len. */
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	struct qdisc_size_table *stab;
	u32 block_index;
	__u32 qlen;

	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = refcount_read(&q->refcnt);
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (q->ops->ingress_block_get) {
		block_index =
q->ops->ingress_block_get(q); if (block_index && nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index)) goto nla_put_failure; } if (q->ops->egress_block_get) { block_index = q->ops->egress_block_get(q); if (block_index && nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index)) goto nla_put_failure; } if (q->ops->dump && q->ops->dump(q, skb) < 0) goto nla_put_failure; if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED))) goto nla_put_failure; qlen = qdisc_qlen_sum(q); stab = rtnl_dereference(q->stab); if (stab && qdisc_dump_stab(skb, stab) < 0) goto nla_put_failure; if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS, NULL, &d, TCA_PAD) < 0) goto nla_put_failure; if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0) goto nla_put_failure; if (qdisc_is_percpu_stats(q)) { cpu_bstats = q->cpu_bstats; cpu_qstats = q->cpu_qstats; } if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats, true) < 0 || gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 || gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0) goto nla_put_failure; if (gnet_stats_finish_copy(&d) < 0) goto nla_put_failure; if (extack && extack->_msg && nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg)) goto out_nlmsg_trim; nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; out_nlmsg_trim: nla_put_failure: nlmsg_trim(skb, b); return -1; } static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible) { if (q->flags & TCQ_F_BUILTIN) return true; if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible) return true; return false; } static int qdisc_get_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, u32 clid, struct Qdisc *q, struct netlink_ext_ack *extack) { struct sk_buff *skb; u32 portid = oskb ? 
NETLINK_CB(oskb).portid : 0; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; if (!tc_qdisc_dump_ignore(q, false)) { if (tc_fill_qdisc(skb, q, clid, portid, n->nlmsg_seq, 0, RTM_NEWQDISC, extack) < 0) goto err_out; } if (skb->len) return rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); err_out: kfree_skb(skb); return -EINVAL; } static int qdisc_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, u32 clid, struct Qdisc *old, struct Qdisc *new, struct netlink_ext_ack *extack) { struct sk_buff *skb; u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) return 0; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; if (old && !tc_qdisc_dump_ignore(old, false)) { if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq, 0, RTM_DELQDISC, extack) < 0) goto err_out; } if (new && !tc_qdisc_dump_ignore(new, false)) { if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC, extack) < 0) goto err_out; } if (skb->len) return rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); err_out: kfree_skb(skb); return -EINVAL; } static void notify_and_destroy(struct net *net, struct sk_buff *skb, struct nlmsghdr *n, u32 clid, struct Qdisc *old, struct Qdisc *new, struct netlink_ext_ack *extack) { if (new || old) qdisc_notify(net, skb, n, clid, old, new, extack); if (old) qdisc_put(old); } static void qdisc_clear_nolock(struct Qdisc *sch) { sch->flags &= ~TCQ_F_NOLOCK; if (!(sch->flags & TCQ_F_CPUSTATS)) return; free_percpu(sch->cpu_bstats); free_percpu(sch->cpu_qstats); sch->cpu_bstats = NULL; sch->cpu_qstats = NULL; sch->flags &= ~TCQ_F_CPUSTATS; } /* Graft qdisc "new" to class "classid" of qdisc "parent" or * to device "dev". * * When appropriate send a netlink notification using 'skb' * and "n". * * On success, destroy old qdisc. 
*/

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old,
		       struct netlink_ext_ack *extack)
{
	struct Qdisc *q = old;
	struct net *net = dev_net(dev);

	/* No parent qdisc: graft directly onto the device, either on its
	 * ingress queue or on every TX queue.
	 */
	if (parent == NULL) {
		unsigned int i, num_q, ingress;
		struct netdev_queue *dev_queue;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			ingress = 1;
			dev_queue = dev_ingress_queue(dev);
			if (!dev_queue) {
				NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
				return -ENOENT;
			}

			q = rtnl_dereference(dev_queue->qdisc_sleeping);

			/* This is the counterpart of that qdisc_refcount_inc_nz() call in
			 * __tcf_qdisc_find() for filter requests.
			 */
			if (!qdisc_refcount_dec_if_one(q)) {
				NL_SET_ERR_MSG(extack,
					       "Current ingress or clsact Qdisc has ongoing filter requests");
				return -EBUSY;
			}
		}

		/* The device is deactivated for the duration of the graft
		 * and re-activated at the end of this branch.
		 */
		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		qdisc_offload_graft_root(dev, new, old, extack);

		/* A qdisc with an ->attach() op is not grafted per queue
		 * here; ->attach() is called below instead.
		 */
		if (new && new->ops->attach && !ingress)
			goto skip;

		if (!ingress) {
			for (i = 0; i < num_q; i++) {
				dev_queue = netdev_get_tx_queue(dev, i);
				old = dev_graft_qdisc(dev_queue, new);

				/* One extra reference on @new for every
				 * queue after the first.
				 */
				if (new && i > 0)
					qdisc_refcount_inc(new);
				qdisc_put(old);
			}
		} else {
			old = dev_graft_qdisc(dev_queue, NULL);

			/* {ingress,clsact}_destroy() @old before grafting @new to avoid
			 * unprotected concurrent accesses to net_device::miniq_{in,e}gress
			 * pointer(s) in mini_qdisc_pair_swap().
			 */
			qdisc_notify(net, skb, n, classid, old, new, extack);
			qdisc_destroy(old);

			dev_graft_qdisc(dev_queue, new);
		}

skip:
		if (!ingress) {
			old = rtnl_dereference(dev->qdisc);
			if (new && !new->ops->attach)
				qdisc_refcount_inc(new);
			/* dev->qdisc falls back to &noop_qdisc when @new is NULL. */
			rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc);

			notify_and_destroy(net, skb, n, classid, old, new, extack);

			if (new && new->ops->attach)
				new->ops->attach(new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		/* Graft as the leaf of class @classid of @parent. */
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
		unsigned long cl;
		int err;

		/* Only support running class lockless if parent is lockless */
		if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK))
			qdisc_clear_nolock(new);

		if (!cops || !cops->graft)
			return -EOPNOTSUPP;

		cl = cops->find(parent, classid);
		if (!cl) {
			NL_SET_ERR_MSG(extack, "Specified class not found");
			return -ENOENT;
		}

		if (new && new->ops == &noqueue_qdisc_ops) {
			NL_SET_ERR_MSG(extack, "Cannot assign noqueue to a class");
			return -EINVAL;
		}

		/* @old is passed by address and is what gets notified and
		 * released afterwards.
		 */
		err = cops->graft(parent, cl, new, &old, extack);
		if (err)
			return err;
		notify_and_destroy(net, skb, n, classid, old, new, extack);
	}
	return 0;
}

/* Apply TCA_INGRESS_BLOCK / TCA_EGRESS_BLOCK attributes, if present, to
 * @sch through its ingress_block_set / egress_block_set ops.
 *
 * Returns 0 on success or when neither attribute is given, -EINVAL for
 * a block index of 0, and -EOPNOTSUPP when the qdisc lacks the matching
 * op. The ingress attribute is processed before the egress one, so an
 * egress failure leaves an ingress index already applied.
 */
static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
				   struct netlink_ext_ack *extack)
{
	u32 block_index;

	if (tca[TCA_INGRESS_BLOCK]) {
		block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);

		if (!block_index) {
			NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
			return -EINVAL;
		}
		if (!sch->ops->ingress_block_set) {
			NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
			return -EOPNOTSUPP;
		}
		sch->ops->ingress_block_set(sch, block_index);
	}
	if (tca[TCA_EGRESS_BLOCK]) {
		block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);

		if (!block_index) {
			NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
			return -EINVAL;
		}
		if (!sch->ops->egress_block_set) {
			NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
			return -EOPNOTSUPP;
		}
		sch->ops->egress_block_set(sch, block_index);
	}
	return 0;
}

/* Allocate and initialize new qdisc. Parameters are passed via opt.
*/

static struct Qdisc *qdisc_create(struct net_device *dev,
				  struct netdev_queue *dev_queue,
				  u32 parent, u32 handle,
				  struct nlattr **tca, int *errp,
				  struct netlink_ext_ack *extack)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	/* Returns the new qdisc, or NULL with the error code stored in
	 * *errp. -EAGAIN in *errp means a module was loaded and the
	 * caller has to replay the request.
	 */
	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load.  So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request.  We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the mean time.
			 */
			rtnl_unlock();
			request_module(NET_SCH_ALIAS_PREFIX "%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try again qdisc_lookup_ops,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (!ops) {
		NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown");
		goto err_out;
	}

	sch = qdisc_alloc(dev_queue, ops, extack);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		if (!(sch->flags & TCQ_F_INGRESS)) {
			NL_SET_ERR_MSG(extack,
				       "Specified parent ID is reserved for ingress and clsact Qdiscs");
			err = -EINVAL;
			goto err_out3;
		}
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
	} else {
		/* A zero handle means "allocate one for me". */
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			if (handle == 0) {
				NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
				err = -ENOSPC;
				goto err_out3;
			}
		}
		if (!netif_is_multiqueue(dev))
			sch->flags |= TCQ_F_ONETXQUEUE;
	}

	sch->handle = handle;

	/* This exists to stay backward compatible with a userspace
	 * loophole that allowed userspace to get the IFF_NO_QUEUE
	 * facility on older kernels by setting tx_queue_len=0 (prior
	 * to qdisc init) and then forgetting to reinit tx_queue_len
	 * before attaching a qdisc again.
	 */
	if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
		netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
	}

	err = qdisc_block_indexes_set(sch, tca, extack);
	if (err)
		goto err_out3;

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab)) {
			err = PTR_ERR(stab);
			goto err_out3;
		}
		rcu_assign_pointer(sch->stab, stab);
	}

	if (ops->init) {
		err = ops->init(sch, tca[TCA_OPTIONS], extack);
		if (err != 0)
			goto err_out4;
	}

	if (tca[TCA_RATE]) {
		err = -EOPNOTSUPP;
		if (sch->flags & TCQ_F_MQROOT) {
			NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
			goto err_out4;
		}

		err = gen_new_estimator(&sch->bstats,
					sch->cpu_bstats,
					&sch->rate_est,
					NULL,
					true,
					tca[TCA_RATE]);
		if (err) {
			NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
			goto err_out4;
		}
	}

	qdisc_hash_add(sch, false);
	trace_qdisc_create(ops, dev, parent);

	return sch;

err_out4:
	/* Even if ops->init() failed, we call ops->destroy()
	 * like qdisc_create_dflt().
	 */
	if (ops->destroy)
		ops->destroy(sch);
	qdisc_put_stab(rtnl_dereference(sch->stab));
err_out3:
	netdev_put(dev, &sch->dev_tracker);
	qdisc_free(sch);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;
}

/* Change parameters of an existing qdisc from the attributes in @tca.
 *
 * TCA_OPTIONS is handed to ->change() and fails with -EINVAL if the
 * qdisc has no such op, or -EOPNOTSUPP if block attributes are also
 * present. The size table is always replaced: with the one parsed from
 * TCA_STAB, or with NULL when that attribute is absent. Errors from
 * replacing the rate estimator are ignored.
 */
static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
			struct netlink_ext_ack *extack)
{
	struct qdisc_size_table *ostab, *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		if (!sch->ops->change) {
			NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
			return -EINVAL;
		}
		if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
			NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
			return -EOPNOTSUPP;
		}
		err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	ostab = rtnl_dereference(sch->stab);
	rcu_assign_pointer(sch->stab, stab);
	qdisc_put_stab(ostab);

	if (tca[TCA_RATE]) {
		/* NB: ignores errors from replace_estimator
		   because change can't be undone. */
		if (sch->flags & TCQ_F_MQROOT)
			goto out;
		gen_replace_estimator(&sch->bstats,
				      sch->cpu_bstats,
				      &sch->rate_est,
				      NULL,
				      true,
				      tca[TCA_RATE]);
	}
out:
	return 0;
}

/* Walker state for check_loop(). The walker must stay the first member
 * because check_loop_fn() casts the qdisc_walker pointer back to this
 * structure.
 */
struct check_loop_arg {
	struct qdisc_walker	w;
	struct Qdisc		*p;	/* qdisc being searched for */
	int			depth;	/* current recursion depth */
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl,
			 struct qdisc_walker *w);

/* Walk the classes of @q looking for @p among the leaf qdiscs.
 *
 * Returns 0 when @q has no classes or the walker's stop flag is clear
 * after the walk, -ELOOP otherwise.
 */
static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

/* Per-class callback of check_loop(): returns -ELOOP when the class
 * leaf is the searched qdisc or the depth exceeds 7, otherwise recurses
 * into the leaf one level deeper. Classes without a leaf return 0.
 */
static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}

/* Netlink attribute policy used when parsing the qdisc and class
 * requests in this file.
 */
const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
	[TCA_KIND]		= { .type = NLA_STRING },
	[TCA_RATE]		= { .type = NLA_BINARY,
				    .len = sizeof(struct tc_estimator) },
	[TCA_STAB]		= { .type = NLA_NESTED },
	[TCA_DUMP_INVISIBLE]	= { .type = NLA_FLAG },
	[TCA_CHAIN]		= { .type = NLA_U32 },
	[TCA_INGRESS_BLOCK]	= { .type = NLA_U32 },
	[TCA_EGRESS_BLOCK]	= { .type = NLA_U32 },
};

/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
				     rtm_tca_policy, extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	/* Locate the qdisc: through its parent when tcm_parent is set,
	 * otherwise directly by handle.
	 */
	clid = tcm->tcm_parent;
	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
					return -ENOENT;
				}
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue(dev)) {
				q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping);
			}
		} else {
			q = rtnl_dereference(dev->qdisc);
		}
		if (!q) {
			NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
			return -ENOENT;
		}

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
			NL_SET_ERR_MSG(extack, "Invalid handle");
			return -EINVAL;
		}
	} else {
		q = qdisc_lookup(dev, tcm->tcm_handle);
		if (!q) {
			NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
			return -ENOENT;
		}
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
		return -EINVAL;
	}

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid) {
			NL_SET_ERR_MSG(extack, "Classid cannot be zero");
			return -EINVAL;
		}
		if (q->handle == 0) {
			NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
			return -ENOENT;
		}
		/* Deleting is grafting NULL in place of q. */
		err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
		if (err != 0)
			return err;
	} else {
		/* The result of the get notification is not propagated. */
		qdisc_get_notify(net, skb, n, clid, q, NULL);
	}
	return 0;
}

/* True when both NLM_F_CREATE and NLM_F_REPLACE are set. */
static bool req_create_or_replace(struct nlmsghdr *n)
{
	return (n->nlmsg_flags & NLM_F_CREATE &&
		n->nlmsg_flags & NLM_F_REPLACE);
}

/* True when both NLM_F_CREATE and NLM_F_EXCL are set. */
static bool req_create_exclusive(struct nlmsghdr *n)
{
	return (n->nlmsg_flags & NLM_F_CREATE &&
		n->nlmsg_flags & NLM_F_EXCL);
}

/* True when none of NLM_F_CREATE, NLM_F_REPLACE and NLM_F_EXCL is set. */
static bool req_change(struct nlmsghdr *n)
{
	return (!(n->nlmsg_flags & NLM_F_CREATE) &&
		!(n->nlmsg_flags & NLM_F_REPLACE) &&
		!(n->nlmsg_flags & NLM_F_EXCL));
}

/*
 * Create/change qdisc.
 */
static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			   struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

replay:
	/* Reinit, just in case something touches this. */
	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
				     rtm_tca_policy, extack);
	if (err < 0)
		return err;

	tcm = nlmsg_data(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;


	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
					return -ENOENT;
				}
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue_create(dev)) {
				q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping);
			}
		} else {
			q = rtnl_dereference(dev->qdisc);
		}

		/* It may be default qdisc, ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
					NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
					return -EEXIST;
				}
				if (TC_H_MIN(tcm->tcm_handle)) {
					NL_SET_ERR_MSG(extack, "Invalid minor handle");
					return -EINVAL;
				}
				q = qdisc_lookup(dev, tcm->tcm_handle);
				if (!q)
					goto create_n_graft;
				if (n->nlmsg_flags & NLM_F_EXCL) {
					NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
					return -EEXIST;
				}
				if (tca[TCA_KIND] &&
				    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
					NL_SET_ERR_MSG(extack, "Invalid qdisc name");
					return -EINVAL;
				}
				if (q->flags & TCQ_F_INGRESS) {
					NL_SET_ERR_MSG(extack,
						       "Cannot regraft ingress or clsact Qdiscs");
					return -EINVAL;
				}
				if (q == p ||
				    (p && check_loop(q, p, 0))) {
					NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
					return -ELOOP;
				}
				if (clid == TC_H_INGRESS) {
					NL_SET_ERR_MSG(extack, "Ingress cannot graft directly");
					return -EINVAL;
				}
				/* Extra reference for the regraft of an
				 * already existing qdisc.
				 */
				qdisc_refcount_inc(q);
				goto graft;
			} else {
				if (!q)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know, that some child q is already
				 *   attached to this parent and have choice:
				 *   1) change it or 2) create/graft new one.
				 *   If the requested qdisc kind is different
				 *   than the existing one, then we choose graft.
				 *   If they are the same then this is "change"
				 *   operation - just let it fallthrough..
				 *
				 *   1. We are allowed to create/graft only
				 *   if the request is explicitly stating
				 *   "please create if it doesn't exist".
				 *
				 *   2. If the request is to exclusive create
				 *   then the qdisc tcm_handle is not expected
				 *   to exist, so that we choose create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   This will happen when for example tc
				 *   utility issues a "change" command.
				 *   Alas, it is sort of hole in API, we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft.
				 */
				if (tca[TCA_KIND] &&
				    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
					if (req_create_or_replace(n) ||
					    req_create_exclusive(n))
						goto create_n_graft;
					else if (req_change(n))
						goto create_n_graft2;
				}
			}
		}
	} else {
		if (!tcm->tcm_handle) {
			NL_SET_ERR_MSG(extack, "Handle cannot be zero");
			return -EINVAL;
		}
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (!q) {
		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
		return -ENOENT;
	}
	if (n->nlmsg_flags & NLM_F_EXCL) {
		NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
		return -EEXIST;
	}
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
		return -EINVAL;
	}
	err = qdisc_change(q, tca, extack);
	if (err == 0)
		qdisc_notify(net, skb, n, clid, NULL, q, extack);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags & NLM_F_CREATE)) {
		NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
		return -ENOENT;
	}
create_n_graft2:
	/* create_n_graft2 is entered directly for the flag-less "change"
	 * case and therefore skips the NLM_F_CREATE check above.
	 */
	if (clid == TC_H_INGRESS) {
		if (dev_ingress_queue(dev)) {
			q = qdisc_create(dev, dev_ingress_queue(dev),
					 tcm->tcm_parent, tcm->tcm_parent,
					 tca, &err, extack);
		} else {
			NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
			err = -ENOENT;
		}
	} else {
		struct netdev_queue *dev_queue;

		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
		else if (p)
			dev_queue = p->dev_queue;
		else
			dev_queue = netdev_get_tx_queue(dev, 0);

		q = qdisc_create(dev, dev_queue,
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err, extack);
	}
	if (q == NULL) {
		/* -EAGAIN from qdisc_create() asks for a replay. */
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
	if (err) {
		if (q)
			qdisc_put(q);
		return err;
	}

	return 0;
}

/* Dump @root and, when @recur is set and the root has a device, every
 * qdisc in that device's qdisc hash.
 *
 * *q_idx_p counts qdiscs seen so far; entries with an index below
 * @s_q_idx are skipped. Returns 0 when done with this root, or -1 when
 * tc_fill_qdisc() did not return a positive length. *q_idx_p is updated
 * in both cases.
 */
static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx, bool recur,
			      bool dump_invisible)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;
	int b;

	if (!root)
		return 0;

	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
				  RTM_NEWQDISC, NULL) <= 0)
			goto done;
		q_idx++;
	}

	/* If dumping singletons, there is no qdisc_dev(root) and the singleton
	 * itself has already been dumped.
	 *
	 * If we've already dumped the top-level (ingress) qdisc above and the global
	 * qdisc hashtable, we don't want to hit it again
	 */
	if (!qdisc_dev(root) || !recur)
		goto out;

	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
				  RTM_NEWQDISC, NULL) <= 0)
			goto done;
		q_idx++;
	}

out:
	*q_idx_p = q_idx;
	return ret;
done:
	ret = -1;
	goto out;
}

/* Netlink dump callback for qdiscs. cb->args[0] holds the index of the
 * device to resume from and cb->args[1] the qdisc index within that
 * device; both are written back before returning skb->len.
 */
static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;
	const struct nlmsghdr *nlh = cb->nlh;
	struct nlattr *tca[TCA_MAX + 1];
	int err;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];

	idx = 0;
	ASSERT_RTNL();

	err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
				     rtm_tca_policy, cb->extack);
	if (err < 0)
		return err;

	for_each_netdev(net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		/* Past the resume device: start from its first qdisc. */
		if (idx > s_idx)
			s_q_idx = 0;
		q_idx = 0;

		if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc),
				       skb, cb, &q_idx, s_q_idx,
				       true, tca[TCA_DUMP_INVISIBLE]) < 0)
			goto done;

		dev_queue = dev_ingress_queue(dev);
		if (dev_queue &&
		    tc_dump_qdisc_root(rtnl_dereference(dev_queue->qdisc_sleeping),
				       skb, cb, &q_idx, s_q_idx, false,
				       tca[TCA_DUMP_INVISIBLE]) < 0)
			goto done;

cont:
		idx++;
	}

done:
	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}



/************************************************
 * Traffic classes manipulation.
* ************************************************/

/* Append one netlink message of type @event describing class @cl of
 * qdisc @q to @skb.
 *
 * Both tcm_parent and tcm_handle start out as the qdisc handle; the
 * class ->dump() op receives the tcmsg pointer.
 * NOTE(review): presumably ->dump() overwrites them with per-class
 * values; confirm against the class ops.
 *
 * Returns skb->len on success. On failure the skb is trimmed back to
 * its tail position on entry and -1 is returned.
 */
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl, u32 portid, u32 seq, u16 flags,
			  int event, struct netlink_ext_ack *extack)
{
	struct tcmsg *tcm;
	struct nlmsghdr *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 NULL, &d, TCA_PAD) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	if (extack && extack->_msg &&
	    nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg))
		goto out_nlmsg_trim;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;

	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

/* Send a class notification of type @event for class @cl of @q.
 *
 * Returns 0 when rtnl_notify_needed() reports that no notification is
 * needed, -ENOBUFS on allocation failure, -EINVAL when the message
 * cannot be filled, otherwise rtnetlink_send()'s return value.
 */
static int tclass_notify(struct net *net, struct sk_buff *oskb,
			 struct nlmsghdr *n, struct Qdisc *q,
			 unsigned long cl, int event, struct netlink_ext_ack *extack)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC))
		return 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event, extack) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
			      n->nlmsg_flags & NLM_F_ECHO);
}

/* Answer a class "get" request with an RTM_NEWTCLASS message. Unlike
 * tclass_notify() there is no rtnl_notify_needed() check; the message
 * is always built and sent.
 */
static int tclass_get_notify(struct net *net, struct sk_buff *oskb,
			     struct nlmsghdr *n, struct Qdisc *q,
			     unsigned long cl, struct netlink_ext_ack *extack)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, RTM_NEWTCLASS,
			   extack) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
			      n->nlmsg_flags & NLM_F_ECHO);
}

/* Delete class @cl of @q and notify about it.
 *
 * The RTM_DELTCLASS message is filled in before ->delete() is called
 * and sent only after the delete succeeded. Returns -EOPNOTSUPP when
 * the qdisc has no ->delete() op.
 */
static int tclass_del_notify(struct net *net,
			     const struct Qdisc_class_ops *cops,
			     struct sk_buff *oskb, struct nlmsghdr *n,
			     struct Qdisc *q, unsigned long cl,
			     struct netlink_ext_ack *extack)
{
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
	struct sk_buff *skb;
	int err = 0;

	if (!cops->delete)
		return -EOPNOTSUPP;

	if (rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) {
		skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
		if (!skb)
			return -ENOBUFS;

		if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
				   RTM_DELTCLASS, extack) < 0) {
			kfree_skb(skb);
			return -EINVAL;
		}
	} else {
		skb = NULL;
	}

	err = cops->delete(q, cl, extack);
	if (err) {
		kfree_skb(skb);
		return err;
	}

	/* skb is NULL here when no notification was needed. */
	err = rtnetlink_maybe_send(skb, net, portid, RTNLGRP_TC,
				   n->nlmsg_flags & NLM_F_ECHO);
	return err;
}

#ifdef CONFIG_NET_CLS

/* Walker state handed to tcf_node_bind(); the tcf_walker must stay the
 * first member because the callback casts the walker pointer back to
 * this structure.
 */
struct tcf_bind_args {
	struct tcf_walker w;
	unsigned long base;
	unsigned long cl;
	u32 classid;
};

/* Filter-walk callback: call the classifier's ->bind_class() for node
 * @n under the qdisc tree lock. Always returns 0.
 */
static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
{
	struct tcf_bind_args *a = (void *)arg;

	if (n && tp->ops->bind_class) {
		struct Qdisc *q = tcf_block_q(tp->chain->block);

		sch_tree_lock(q);
		tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
		sch_tree_unlock(q);
	}
	return 0;
}

/* Walker state for tc_bind_class_walker(); the qdisc_walker must stay
 * the first member for the same cast reason.
 */
struct tc_bind_class_args {
	struct qdisc_walker w;
	unsigned long new_cl;
	u32 portid;
	u32 clid;
};

/* Class-walk callback: for class @cl, visit every filter on every chain
 * of the class's block and run tcf_node_bind() on it. Always returns 0.
 */
static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
				struct qdisc_walker *w)
{
	struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct tcf_block *block;
	struct tcf_chain *chain;

	block = cops->tcf_block(q, cl, NULL);
	if (!block)
		return 0;
	for (chain = tcf_get_next_chain(block, NULL);
	     chain;
	     chain = tcf_get_next_chain(block, chain)) {
		struct tcf_proto *tp;

		for (tp = tcf_get_next_proto(chain, NULL);
		     tp; tp = tcf_get_next_proto(chain, tp)) {
			struct tcf_bind_args arg = {};

			arg.w.fn = tcf_node_bind;
			arg.classid = a->clid;
			arg.base = cl;
			arg.cl = a->new_cl;
			tp->ops->walk(tp, &arg.w, true);
		}
	}

	return 0;
}

/* Walk all classes of @q and hand @clid / @new_cl to the filters'
 * ->bind_class() op. Does nothing when the qdisc has no ->tcf_block()
 * class op.
 */
static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
			   unsigned long new_cl)
{
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct tc_bind_class_args args = {};

	if (!cops->tcf_block)
		return;
	args.portid = portid;
	args.clid = clid;
	args.new_cl = new_cl;
	args.w.fn = tc_bind_class_walker;
	q->ops->cl_ops->walk(q, &args.w);
}

#else

/* No-op stub used when CONFIG_NET_CLS is not defined. */
static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
			   unsigned long new_cl)
{
}

#endif

/* Handler for RTM_NEWTCLASS, RTM_DELTCLASS and RTM_GETTCLASS requests. */
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
			 struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 portid;
	u32 clid;
	u32 qid;
	int err;

	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
				     rtm_tca_policy, extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - clear.
	   handle == X:0	 - root class.
	 */

	/* Step 1. Determine qdisc handle X:0 */

	/* Despite its name, portid holds the parent handle taken from
	 * tcm_parent in this function, not a netlink port id.
	 */
	portid = tcm->tcm_parent;
	clid = tcm->tcm_handle;
	qid = TC_H_MAJ(clid);

	if (portid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(portid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = rtnl_dereference(dev->qdisc)->handle;

		/* Now qid is genuine qdisc handle consistent
		 * both with parent and child.
		 *
		 * TC_H_MAJ(portid) still may be unspecified, complete it now.
		 */
		if (portid)
			portid = TC_H_MAKE(qid, portid);
	} else {
		if (qid == 0)
			qid = rtnl_dereference(dev->qdisc)->handle;
	}

	/* OK. Locate qdisc */
	q = qdisc_lookup(dev, qid);
	if (!q)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (portid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->find(q, clid);

	if (cl == 0) {
		/* Class not found: only a create request may continue. */
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS ||
		    !(n->nlmsg_flags & NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags & NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = tclass_del_notify(net, cops, skb, n, q, cl, extack);
			/* Unbind the class with filters with 0 */
			tc_bind_tclass(q, portid, clid, 0);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_get_notify(net, skb, n, q, cl, extack);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
		NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
		return -EOPNOTSUPP;
	}

	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, portid, tca, &new_cl, extack);
	if (err == 0) {
		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS, extack);
		/* We just create a new class, need to do reverse binding. */
		if (cl != new_cl)
			tc_bind_tclass(q, portid, clid, new_cl);
	}
out:
	return err;
}

/* Walker state for qdisc_class_dump(); the qdisc_walker must stay the
 * first member because the callback casts the walker pointer back to
 * this structure.
 */
struct qdisc_dump_args {
	struct qdisc_walker	w;
	struct sk_buff		*skb;
	struct netlink_callback	*cb;
};

/* Class-walk callback that emits one RTM_NEWTCLASS message per class
 * and returns tc_fill_tclass()'s result.
 */
static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
			    struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
			      RTM_NEWTCLASS, NULL);
}

/* Dump the classes of one qdisc.
 *
 * *t_p counts qdiscs visited and @s_t is the qdisc index to resume
 * from; cb->args[1] carries the class skip count within that qdisc.
 * Returns -1 when the class walk set its stop flag, 0 otherwise.
 */
static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	if (tc_qdisc_dump_ignore(q, false) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	/* Past the resume qdisc: reset all callback args after args[0]. */
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop  = 0;
	arg.w.skip = cb->args[1];
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}

/* Dump classes of @root and, when @recur is set and the root has a
 * device, of the qdiscs below it: only the one matching tcm_parent's
 * major if a parent was given, otherwise every qdisc in the device's
 * qdisc hash. Returns -1 as soon as one qdisc dump returns < 0.
 */
static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
			       struct tcmsg *tcm, struct netlink_callback *cb,
			       int *t_p, int s_t, bool recur)
{
	struct Qdisc *q;
	int b;

	if (!root)
		return 0;

	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
		return -1;

	if (!qdisc_dev(root) || !recur)
		return 0;

	if (tcm->tcm_parent) {
		q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
		if (q && q != root &&
		    tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
		return 0;
	}
	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
	}

	return 0;
}

/* Netlink dump callback for classes of a single device.
 *
 * Returns 0 when the request is shorter than a tcmsg or the device
 * does not exist. cb->args[0] stores the qdisc index to resume from.
 * The device reference taken by dev_get_by_index() is dropped before
 * returning skb->len.
 */
static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = nlmsg_data(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
		return 0;
	dev = dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return 0;

	s_t = cb->args[0];
	t = 0;

	if (tc_dump_tclass_root(rtnl_dereference(dev->qdisc),
				skb, tcm, cb, &t, s_t, true) < 0)
		goto done;

	dev_queue = dev_ingress_queue(dev);
	if (dev_queue &&
	    tc_dump_tclass_root(rtnl_dereference(dev_queue->qdisc_sleeping),
				skb, tcm, cb, &t, s_t, false) < 0)
		goto done;

done:
	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}

#ifdef CONFIG_PROC_FS
/* seq_file show routine for the "psched" proc entry: prints four hex
 * words - NSEC_PER_USEC, PSCHED_TICKS2NS(1), the constant 1000000 and
 * NSEC_PER_SEC / hrtimer_resolution.
 */
static int psched_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC / hrtimer_resolution);

	return 0;
}

/* Create the per-netns "psched" proc entry; -ENOMEM on failure. */
static int __net_init psched_net_init(struct net *net)
{
	struct proc_dir_entry *e;

	e = proc_create_single("psched", 0, net->proc_net, psched_show);
	if (e == NULL)
		return -ENOMEM;

	return 0;
}

/* Remove the per-netns "psched" proc entry. */
static void __net_exit psched_net_exit(struct net *net)
{
	remove_proc_entry("psched", net->proc_net);
}
#else
/* Stubs used when CONFIG_PROC_FS is not defined. */
static int __net_init psched_net_init(struct net *net)
{
	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
}
#endif

/* Per network namespace setup/teardown hooks for the psched entry. */
static struct pernet_operations psched_net_ops = {
	.init = psched_net_init,
	.exit = psched_net_exit,
};

#if IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)
DEFINE_STATIC_KEY_FALSE(tc_skip_wrapper);
#endif

/* Subsystem init: register the pernet ops, the qdiscs listed below and
 * the rtnetlink handlers for qdisc and class messages.
 *
 * Only the pernet registration is checked for failure; the return
 * values of register_qdisc() and rtnl_register() are not examined.
 */
static int __init pktsched_init(void)
{
	int err;

	err = register_pernet_subsys(&psched_net_ops);
	if (err) {
		pr_err("pktsched_init: "
		       "cannot initialize per netns operations\n");
		return err;
	}

	register_qdisc(&pfifo_fast_ops);
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	register_qdisc(&pfifo_head_drop_qdisc_ops);
	register_qdisc(&mq_qdisc_ops);
	register_qdisc(&noqueue_qdisc_ops);

	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
	/* Delete and get share one handler; only get has a dump callback. */
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
		      0);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
		      0);

	tc_wrapper_init();

	return 0;
}

subsys_initcall(pktsched_init);
43 44 44 43 34 52 53 22 14 4 33 48 47 143 1 147 17 2 48 86 14 15 15 8 8 1 1 2 2 2 2 37 12 4 5 5 2 3 2 3 6 2 17 2 9 6 1 6 1 6 1 5 2 5 2 6 1 1 5 4 1 1 2 4 1 2 2 5 1 3 23 2 11 8 2 9 1 9 1 8 2 8 2 9 1 3 3 3 3 93 60 42 27 44 80 3 62 72 73 72 41 24 17 33 7 28 13 35 2 4 31 8 41 36 5 31 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 
457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 
957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2013 Nicira, Inc. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/in.h> #include <linux/if_arp.h> #include <linux/init.h> #include <linux/in6.h> #include <linux/inetdevice.h> #include <linux/netfilter_ipv4.h> #include <linux/etherdevice.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/static_key.h> #include <net/ip.h> #include <net/icmp.h> #include <net/protocol.h> #include <net/ip_tunnels.h> #include <net/ip6_tunnel.h> #include <net/ip6_checksum.h> #include <net/arp.h> #include <net/checksum.h> #include <net/dsfield.h> #include <net/inet_ecn.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/rtnetlink.h> #include <net/dst_metadata.h> #include <net/geneve.h> #include <net/vxlan.h> #include <net/erspan.h> const struct ip_tunnel_encap_ops __rcu * iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly; 
EXPORT_SYMBOL(iptun_encaps);

/* Table of IPv6 tunnel encapsulation ops, MAX_IPTUN_ENCAP_OPS entries. */
const struct ip6_tnl_encap_ops __rcu *
		ip6tun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
EXPORT_SYMBOL(ip6tun_encaps);

/*
 * Prepend an outer IPv4 header to @skb and hand it to ip_local_out().
 *
 * The payload length used for statistics is sampled up front as
 * skb->len minus the inner network offset, i.e. before the outer header is
 * pushed. The skb is scrubbed, gets @rt's dst attached and has its IP control
 * block zeroed. The DF bits come from @df unless the route's MTU is locked,
 * in which case frag_off is 0. If the skb had a device on entry, that
 * device's tunnel stats are updated with the payload length, or with 0 when
 * net_xmit_eval() reports the transmit as failed.
 */
void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
		   __be32 src, __be32 dst, __u8 proto,
		   __u8 tos, __u8 ttl, __be16 df, bool xnet)
{
	int pkt_len = skb->len - skb_inner_network_offset(skb);
	struct net *net = dev_net(rt->dst.dev);
	struct net_device *dev = skb->dev;
	struct iphdr *iph;
	int err;

	skb_scrub_packet(skb, xnet);

	skb_clear_hash_if_not_l4(skb);
	skb_dst_set(skb, &rt->dst);
	memset(IPCB(skb), 0, sizeof(*IPCB(skb)));

	/* Push down and install the IP header. */
	skb_push(skb, sizeof(struct iphdr));
	skb_reset_network_header(skb);

	iph = ip_hdr(skb);

	iph->version	=	4;
	iph->ihl	=	sizeof(struct iphdr) >> 2;
	iph->frag_off	=	ip_mtu_locked(&rt->dst) ? 0 : df;
	iph->protocol	=	proto;
	iph->tos	=	tos;
	iph->daddr	=	dst;
	iph->saddr	=	src;
	iph->ttl	=	ttl;
	/* a gso_segs of 0 is passed on as 1 */
	__ip_select_ident(net, iph, skb_shinfo(skb)->gso_segs ?: 1);

	err = ip_local_out(net, sk, skb);

	if (dev) {
		if (unlikely(net_xmit_eval(err)))
			pkt_len = 0;
		iptunnel_xmit_stats(dev, pkt_len);
	}
}
EXPORT_SYMBOL_GPL(iptunnel_xmit);

/*
 * Strip a tunnel header of @hdr_len bytes from a received skb and prepare the
 * inner packet for further processing.
 *
 * Returns -ENOMEM when the tunnel header (or, for ETH_P_TEB, the inner
 * Ethernet header) cannot be pulled into the linear area. For ETH_P_TEB with
 * @raw_proto false, skb->protocol is taken from the inner Ethernet header when
 * eth_proto_is_802_3() accepts it and set to ETH_P_802_2 otherwise; in all
 * other cases skb->protocol becomes @inner_proto. The hash (unless L4), VLAN
 * hwaccel tag and queue mapping are cleared and the skb is scrubbed before
 * returning the result of iptunnel_pull_offloads().
 */
int __iptunnel_pull_header(struct sk_buff *skb, int hdr_len,
			   __be16 inner_proto, bool raw_proto, bool xnet)
{
	if (unlikely(!pskb_may_pull(skb, hdr_len)))
		return -ENOMEM;

	skb_pull_rcsum(skb, hdr_len);

	if (!raw_proto && inner_proto == htons(ETH_P_TEB)) {
		struct ethhdr *eh;

		if (unlikely(!pskb_may_pull(skb, ETH_HLEN)))
			return -ENOMEM;

		eh = (struct ethhdr *)skb->data;
		if (likely(eth_proto_is_802_3(eh->h_proto)))
			skb->protocol = eh->h_proto;
		else
			skb->protocol = htons(ETH_P_802_2);

	} else {
		skb->protocol = inner_proto;
	}

	skb_clear_hash_if_not_l4(skb);
	__vlan_hwaccel_clear_tag(skb);
	skb_set_queue_mapping(skb, 0);
	skb_scrub_packet(skb, xnet);

	return iptunnel_pull_offloads(skb);
}
EXPORT_SYMBOL_GPL(__iptunnel_pull_header);

/*
 * Build TX tunnel metadata that answers the sender of received metadata @md.
 *
 * Returns NULL if @md is NULL, is not METADATA_IP_TUNNEL, already has
 * IP_TUNNEL_INFO_TX set, or if allocation fails. Otherwise a new metadata_dst
 * is allocated with room for the same options; the tunnel id and flags are
 * copied, the received source address (IPv4 or IPv6 depending on the mode)
 * becomes the destination, IP_TUNNEL_INFO_TX is added to the mode, and the
 * options are copied over.
 */
struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,
					     gfp_t flags)
{
	struct metadata_dst *res;
	struct ip_tunnel_info *dst, *src;

	if (!md || md->type != METADATA_IP_TUNNEL ||
	    md->u.tun_info.mode & IP_TUNNEL_INFO_TX)
		return NULL;

	src = &md->u.tun_info;
	res = metadata_dst_alloc(src->options_len, METADATA_IP_TUNNEL, flags);
	if (!res)
		return NULL;

	dst = &res->u.tun_info;
	dst->key.tun_id = src->key.tun_id;
	if (src->mode & IP_TUNNEL_INFO_IPV6)
		memcpy(&dst->key.u.ipv6.dst, &src->key.u.ipv6.src,
		       sizeof(struct in6_addr));
	else
		dst->key.u.ipv4.dst = src->key.u.ipv4.src;
	dst->key.tun_flags = src->key.tun_flags;
	dst->mode = src->mode | IP_TUNNEL_INFO_TX;
	ip_tunnel_info_opts_set(dst, ip_tunnel_info_opts(src),
				src->options_len, 0);

	return res;
}
EXPORT_SYMBOL_GPL(iptunnel_metadata_reply);

/*
 * Prepare offload state of an skb that is about to be encapsulated.
 *
 * If the skb is not yet marked as encapsulated, the inner header offsets are
 * recorded and skb->encapsulation is set. For GSO skbs the header is uncloned
 * (the error from skb_header_unclone() is returned on failure) and
 * @gso_type_mask is OR-ed into the GSO type. For non-GSO skbs whose checksum
 * state is not CHECKSUM_PARTIAL, the checksum state is reset to CHECKSUM_NONE
 * and the encapsulation mark is cleared again.
 */
int iptunnel_handle_offloads(struct sk_buff *skb,
			     int gso_type_mask)
{
	int err;

	if (likely(!skb->encapsulation)) {
		skb_reset_inner_headers(skb);
		skb->encapsulation = 1;
	}

	if (skb_is_gso(skb)) {
		err = skb_header_unclone(skb, GFP_ATOMIC);
		if (unlikely(err))
			return err;
		skb_shinfo(skb)->gso_type |= gso_type_mask;
		return 0;
	}

	if (skb->ip_summed != CHECKSUM_PARTIAL) {
		skb->ip_summed = CHECKSUM_NONE;
		/* We clear encapsulation here to prevent badly-written
		 * drivers potentially deciding to offload an inner checksum
		 * if we set CHECKSUM_PARTIAL on the outer header.
		 * This should go away when the drivers are all fixed.
		 */
		skb->encapsulation = 0;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(iptunnel_handle_offloads);

/**
 * iptunnel_pmtud_build_icmp() - Build ICMP error message for PMTUD
 * @skb:	Original packet with L2 header
 * @mtu:	MTU value for ICMP error
 *
 * Return: length on success, negative error code if message couldn't be built.
*/
static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu)
{
	const struct iphdr *iph;
	struct icmphdr *icmph;
	struct iphdr *niph;
	struct ethhdr eh;
	int len, err;

	if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct iphdr)))
		return -EINVAL;

	/* Save the L2 header; it is re-created with swapped addresses below. */
	skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN);
	pskb_pull(skb, ETH_HLEN);
	skb_reset_network_header(skb);

	/* Keep the quoted packet small enough for a 576-byte ICMP error. */
	err = pskb_trim(skb, 576 - sizeof(*niph) - sizeof(*icmph));
	if (err)
		return err;

	/* ICMP message length: ICMP header plus the (trimmed) original. */
	len = skb->len + sizeof(*icmph);

	err = skb_cow(skb, sizeof(*niph) + sizeof(*icmph) + ETH_HLEN);
	if (err)
		return err;

	/* Look up the original IP header only now: pskb_may_pull(),
	 * pskb_trim() and skb_cow() may all reallocate the skb head, which
	 * would leave a pointer taken earlier dangling. The network header was
	 * reset to the original IP header above, and the pushes below only
	 * write in front of it.
	 */
	iph = ip_hdr(skb);

	icmph = skb_push(skb, sizeof(*icmph));
	*icmph = (struct icmphdr) {
		.type			= ICMP_DEST_UNREACH,
		.code			= ICMP_FRAG_NEEDED,
		.checksum		= 0,
		.un.frag.__unused	= 0,
		.un.frag.mtu		= htons(mtu),
	};
	icmph->checksum = csum_fold(skb_checksum(skb, 0, len, 0));
	skb_reset_transport_header(skb);

	/* New IP header answers the sender: addresses swapped, TTL reused. */
	niph = skb_push(skb, sizeof(*niph));
	*niph = (struct iphdr) {
		.ihl			= sizeof(*niph) / 4u,
		.version 		= 4,
		.tos 			= 0,
		.tot_len		= htons(len + sizeof(*niph)),
		.id			= 0,
		.frag_off		= htons(IP_DF),
		.ttl			= iph->ttl,
		.protocol		= IPPROTO_ICMP,
		.saddr			= iph->daddr,
		.daddr			= iph->saddr,
	};
	ip_send_check(niph);
	skb_reset_network_header(skb);

	skb->ip_summed = CHECKSUM_NONE;

	/* Rebuild the Ethernet header with source and destination swapped. */
	eth_header(skb, skb->dev, ntohs(eh.h_proto), eh.h_source, eh.h_dest, 0);
	skb_reset_mac_header(skb);

	return skb->len;
}

/**
 * iptunnel_pmtud_check_icmp() - Trigger ICMP reply if needed and allowed
 * @skb:	Buffer being sent by encapsulation, L2 headers expected
 * @mtu:	Network MTU for path
 *
 * Return: 0 for no ICMP reply, length if built, negative value on error.
*/ static int iptunnel_pmtud_check_icmp(struct sk_buff *skb, int mtu) { const struct icmphdr *icmph = icmp_hdr(skb); const struct iphdr *iph = ip_hdr(skb); if (mtu < 576 || iph->frag_off != htons(IP_DF)) return 0; if (ipv4_is_lbcast(iph->daddr) || ipv4_is_multicast(iph->daddr) || ipv4_is_zeronet(iph->saddr) || ipv4_is_loopback(iph->saddr) || ipv4_is_lbcast(iph->saddr) || ipv4_is_multicast(iph->saddr)) return 0; if (iph->protocol == IPPROTO_ICMP && icmp_is_err(icmph->type)) return 0; return iptunnel_pmtud_build_icmp(skb, mtu); } #if IS_ENABLED(CONFIG_IPV6) /** * iptunnel_pmtud_build_icmpv6() - Build ICMPv6 error message for PMTUD * @skb: Original packet with L2 header * @mtu: MTU value for ICMPv6 error * * Return: length on success, negative error code if message couldn't be built. */ static int iptunnel_pmtud_build_icmpv6(struct sk_buff *skb, int mtu) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); struct icmp6hdr *icmp6h; struct ipv6hdr *nip6h; struct ethhdr eh; int len, err; __wsum csum; if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct ipv6hdr))) return -EINVAL; skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN); pskb_pull(skb, ETH_HLEN); skb_reset_network_header(skb); err = pskb_trim(skb, IPV6_MIN_MTU - sizeof(*nip6h) - sizeof(*icmp6h)); if (err) return err; len = skb->len + sizeof(*icmp6h); err = skb_cow(skb, sizeof(*nip6h) + sizeof(*icmp6h) + ETH_HLEN); if (err) return err; icmp6h = skb_push(skb, sizeof(*icmp6h)); *icmp6h = (struct icmp6hdr) { .icmp6_type = ICMPV6_PKT_TOOBIG, .icmp6_code = 0, .icmp6_cksum = 0, .icmp6_mtu = htonl(mtu), }; skb_reset_transport_header(skb); nip6h = skb_push(skb, sizeof(*nip6h)); *nip6h = (struct ipv6hdr) { .priority = 0, .version = 6, .flow_lbl = { 0 }, .payload_len = htons(len), .nexthdr = IPPROTO_ICMPV6, .hop_limit = ip6h->hop_limit, .saddr = ip6h->daddr, .daddr = ip6h->saddr, }; skb_reset_network_header(skb); csum = skb_checksum(skb, skb_transport_offset(skb), len, 0); icmp6h->icmp6_cksum = csum_ipv6_magic(&nip6h->saddr, 
&nip6h->daddr, len, IPPROTO_ICMPV6, csum); skb->ip_summed = CHECKSUM_NONE; eth_header(skb, skb->dev, ntohs(eh.h_proto), eh.h_source, eh.h_dest, 0); skb_reset_mac_header(skb); return skb->len; } /** * iptunnel_pmtud_check_icmpv6() - Trigger ICMPv6 reply if needed and allowed * @skb: Buffer being sent by encapsulation, L2 headers expected * @mtu: Network MTU for path * * Return: 0 for no ICMPv6 reply, length if built, negative value on error. */ static int iptunnel_pmtud_check_icmpv6(struct sk_buff *skb, int mtu) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); int stype = ipv6_addr_type(&ip6h->saddr); u8 proto = ip6h->nexthdr; __be16 frag_off; int offset; if (mtu < IPV6_MIN_MTU) return 0; if (stype == IPV6_ADDR_ANY || stype == IPV6_ADDR_MULTICAST || stype == IPV6_ADDR_LOOPBACK) return 0; offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &proto, &frag_off); if (offset < 0 || (frag_off & htons(~0x7))) return 0; if (proto == IPPROTO_ICMPV6) { struct icmp6hdr *icmp6h; if (!pskb_may_pull(skb, skb_network_header(skb) + offset + 1 - skb->data)) return 0; icmp6h = (struct icmp6hdr *)(skb_network_header(skb) + offset); if (icmpv6_is_err(icmp6h->icmp6_type) || icmp6h->icmp6_type == NDISC_REDIRECT) return 0; } return iptunnel_pmtud_build_icmpv6(skb, mtu); } #endif /* IS_ENABLED(CONFIG_IPV6) */ /** * skb_tunnel_check_pmtu() - Check, update PMTU and trigger ICMP reply as needed * @skb: Buffer being sent by encapsulation, L2 headers expected * @encap_dst: Destination for tunnel encapsulation (outer IP) * @headroom: Encapsulation header size, bytes * @reply: Build matching ICMP or ICMPv6 message as a result * * L2 tunnel implementations that can carry IP and can be directly bridged * (currently UDP tunnels) can't always rely on IP forwarding paths to handle * PMTU discovery. In the bridged case, ICMP or ICMPv6 messages need to be built * based on payload and sent back by the encapsulation itself. * * For routable interfaces, we just need to update the PMTU for the destination. 
* * Return: 0 if ICMP error not needed, length if built, negative value on error */ int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst, int headroom, bool reply) { u32 mtu = dst_mtu(encap_dst) - headroom; if ((skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) || (!skb_is_gso(skb) && (skb->len - skb_network_offset(skb)) <= mtu)) return 0; skb_dst_update_pmtu_no_confirm(skb, mtu); if (!reply || skb->pkt_type == PACKET_HOST) return 0; if (skb->protocol == htons(ETH_P_IP)) return iptunnel_pmtud_check_icmp(skb, mtu); #if IS_ENABLED(CONFIG_IPV6) if (skb->protocol == htons(ETH_P_IPV6)) return iptunnel_pmtud_check_icmpv6(skb, mtu); #endif return 0; } EXPORT_SYMBOL(skb_tunnel_check_pmtu); static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = { [LWTUNNEL_IP_UNSPEC] = { .strict_start_type = LWTUNNEL_IP_OPTS }, [LWTUNNEL_IP_ID] = { .type = NLA_U64 }, [LWTUNNEL_IP_DST] = { .type = NLA_U32 }, [LWTUNNEL_IP_SRC] = { .type = NLA_U32 }, [LWTUNNEL_IP_TTL] = { .type = NLA_U8 }, [LWTUNNEL_IP_TOS] = { .type = NLA_U8 }, [LWTUNNEL_IP_FLAGS] = { .type = NLA_U16 }, [LWTUNNEL_IP_OPTS] = { .type = NLA_NESTED }, }; static const struct nla_policy ip_opts_policy[LWTUNNEL_IP_OPTS_MAX + 1] = { [LWTUNNEL_IP_OPTS_GENEVE] = { .type = NLA_NESTED }, [LWTUNNEL_IP_OPTS_VXLAN] = { .type = NLA_NESTED }, [LWTUNNEL_IP_OPTS_ERSPAN] = { .type = NLA_NESTED }, }; static const struct nla_policy geneve_opt_policy[LWTUNNEL_IP_OPT_GENEVE_MAX + 1] = { [LWTUNNEL_IP_OPT_GENEVE_CLASS] = { .type = NLA_U16 }, [LWTUNNEL_IP_OPT_GENEVE_TYPE] = { .type = NLA_U8 }, [LWTUNNEL_IP_OPT_GENEVE_DATA] = { .type = NLA_BINARY, .len = 128 }, }; static const struct nla_policy vxlan_opt_policy[LWTUNNEL_IP_OPT_VXLAN_MAX + 1] = { [LWTUNNEL_IP_OPT_VXLAN_GBP] = { .type = NLA_U32 }, }; static const struct nla_policy erspan_opt_policy[LWTUNNEL_IP_OPT_ERSPAN_MAX + 1] = { [LWTUNNEL_IP_OPT_ERSPAN_VER] = { .type = NLA_U8 }, [LWTUNNEL_IP_OPT_ERSPAN_INDEX] = { .type = NLA_U32 }, 
[LWTUNNEL_IP_OPT_ERSPAN_DIR] = { .type = NLA_U8 }, [LWTUNNEL_IP_OPT_ERSPAN_HWID] = { .type = NLA_U8 }, }; static int ip_tun_parse_opts_geneve(struct nlattr *attr, struct ip_tunnel_info *info, int opts_len, struct netlink_ext_ack *extack) { struct nlattr *tb[LWTUNNEL_IP_OPT_GENEVE_MAX + 1]; int data_len, err; err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_GENEVE_MAX, attr, geneve_opt_policy, extack); if (err) return err; if (!tb[LWTUNNEL_IP_OPT_GENEVE_CLASS] || !tb[LWTUNNEL_IP_OPT_GENEVE_TYPE] || !tb[LWTUNNEL_IP_OPT_GENEVE_DATA]) return -EINVAL; attr = tb[LWTUNNEL_IP_OPT_GENEVE_DATA]; data_len = nla_len(attr); if (data_len % 4) return -EINVAL; if (info) { struct geneve_opt *opt = ip_tunnel_info_opts(info) + opts_len; memcpy(opt->opt_data, nla_data(attr), data_len); opt->length = data_len / 4; attr = tb[LWTUNNEL_IP_OPT_GENEVE_CLASS]; opt->opt_class = nla_get_be16(attr); attr = tb[LWTUNNEL_IP_OPT_GENEVE_TYPE]; opt->type = nla_get_u8(attr); info->key.tun_flags |= TUNNEL_GENEVE_OPT; } return sizeof(struct geneve_opt) + data_len; } static int ip_tun_parse_opts_vxlan(struct nlattr *attr, struct ip_tunnel_info *info, int opts_len, struct netlink_ext_ack *extack) { struct nlattr *tb[LWTUNNEL_IP_OPT_VXLAN_MAX + 1]; int err; err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_VXLAN_MAX, attr, vxlan_opt_policy, extack); if (err) return err; if (!tb[LWTUNNEL_IP_OPT_VXLAN_GBP]) return -EINVAL; if (info) { struct vxlan_metadata *md = ip_tunnel_info_opts(info) + opts_len; attr = tb[LWTUNNEL_IP_OPT_VXLAN_GBP]; md->gbp = nla_get_u32(attr); md->gbp &= VXLAN_GBP_MASK; info->key.tun_flags |= TUNNEL_VXLAN_OPT; } return sizeof(struct vxlan_metadata); } static int ip_tun_parse_opts_erspan(struct nlattr *attr, struct ip_tunnel_info *info, int opts_len, struct netlink_ext_ack *extack) { struct nlattr *tb[LWTUNNEL_IP_OPT_ERSPAN_MAX + 1]; int err; u8 ver; err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_ERSPAN_MAX, attr, erspan_opt_policy, extack); if (err) return err; if (!tb[LWTUNNEL_IP_OPT_ERSPAN_VER]) 
return -EINVAL; ver = nla_get_u8(tb[LWTUNNEL_IP_OPT_ERSPAN_VER]); if (ver == 1) { if (!tb[LWTUNNEL_IP_OPT_ERSPAN_INDEX]) return -EINVAL; } else if (ver == 2) { if (!tb[LWTUNNEL_IP_OPT_ERSPAN_DIR] || !tb[LWTUNNEL_IP_OPT_ERSPAN_HWID]) return -EINVAL; } else { return -EINVAL; } if (info) { struct erspan_metadata *md = ip_tunnel_info_opts(info) + opts_len; md->version = ver; if (ver == 1) { attr = tb[LWTUNNEL_IP_OPT_ERSPAN_INDEX]; md->u.index = nla_get_be32(attr); } else { attr = tb[LWTUNNEL_IP_OPT_ERSPAN_DIR]; md->u.md2.dir = nla_get_u8(attr); attr = tb[LWTUNNEL_IP_OPT_ERSPAN_HWID]; set_hwid(&md->u.md2, nla_get_u8(attr)); } info->key.tun_flags |= TUNNEL_ERSPAN_OPT; } return sizeof(struct erspan_metadata); } static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info, struct netlink_ext_ack *extack) { int err, rem, opt_len, opts_len = 0; struct nlattr *nla; __be16 type = 0; if (!attr) return 0; err = nla_validate(nla_data(attr), nla_len(attr), LWTUNNEL_IP_OPTS_MAX, ip_opts_policy, extack); if (err) return err; nla_for_each_attr(nla, nla_data(attr), nla_len(attr), rem) { switch (nla_type(nla)) { case LWTUNNEL_IP_OPTS_GENEVE: if (type && type != TUNNEL_GENEVE_OPT) return -EINVAL; opt_len = ip_tun_parse_opts_geneve(nla, info, opts_len, extack); if (opt_len < 0) return opt_len; opts_len += opt_len; if (opts_len > IP_TUNNEL_OPTS_MAX) return -EINVAL; type = TUNNEL_GENEVE_OPT; break; case LWTUNNEL_IP_OPTS_VXLAN: if (type) return -EINVAL; opt_len = ip_tun_parse_opts_vxlan(nla, info, opts_len, extack); if (opt_len < 0) return opt_len; opts_len += opt_len; type = TUNNEL_VXLAN_OPT; break; case LWTUNNEL_IP_OPTS_ERSPAN: if (type) return -EINVAL; opt_len = ip_tun_parse_opts_erspan(nla, info, opts_len, extack); if (opt_len < 0) return opt_len; opts_len += opt_len; type = TUNNEL_ERSPAN_OPT; break; default: return -EINVAL; } } return opts_len; } static int ip_tun_get_optlen(struct nlattr *attr, struct netlink_ext_ack *extack) { return ip_tun_parse_opts(attr, NULL, 
extack); } static int ip_tun_set_opts(struct nlattr *attr, struct ip_tunnel_info *info, struct netlink_ext_ack *extack) { return ip_tun_parse_opts(attr, info, extack); } static int ip_tun_build_state(struct net *net, struct nlattr *attr, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) { struct nlattr *tb[LWTUNNEL_IP_MAX + 1]; struct lwtunnel_state *new_state; struct ip_tunnel_info *tun_info; int err, opt_len; err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP_MAX, attr, ip_tun_policy, extack); if (err < 0) return err; opt_len = ip_tun_get_optlen(tb[LWTUNNEL_IP_OPTS], extack); if (opt_len < 0) return opt_len; new_state = lwtunnel_state_alloc(sizeof(*tun_info) + opt_len); if (!new_state) return -ENOMEM; new_state->type = LWTUNNEL_ENCAP_IP; tun_info = lwt_tun_info(new_state); err = ip_tun_set_opts(tb[LWTUNNEL_IP_OPTS], tun_info, extack); if (err < 0) { lwtstate_free(new_state); return err; } #ifdef CONFIG_DST_CACHE err = dst_cache_init(&tun_info->dst_cache, GFP_KERNEL); if (err) { lwtstate_free(new_state); return err; } #endif if (tb[LWTUNNEL_IP_ID]) tun_info->key.tun_id = nla_get_be64(tb[LWTUNNEL_IP_ID]); if (tb[LWTUNNEL_IP_DST]) tun_info->key.u.ipv4.dst = nla_get_in_addr(tb[LWTUNNEL_IP_DST]); if (tb[LWTUNNEL_IP_SRC]) tun_info->key.u.ipv4.src = nla_get_in_addr(tb[LWTUNNEL_IP_SRC]); if (tb[LWTUNNEL_IP_TTL]) tun_info->key.ttl = nla_get_u8(tb[LWTUNNEL_IP_TTL]); if (tb[LWTUNNEL_IP_TOS]) tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]); if (tb[LWTUNNEL_IP_FLAGS]) tun_info->key.tun_flags |= (nla_get_be16(tb[LWTUNNEL_IP_FLAGS]) & ~TUNNEL_OPTIONS_PRESENT); tun_info->mode = IP_TUNNEL_INFO_TX; tun_info->options_len = opt_len; *ts = new_state; return 0; } static void ip_tun_destroy_state(struct lwtunnel_state *lwtstate) { #ifdef CONFIG_DST_CACHE struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate); dst_cache_destroy(&tun_info->dst_cache); #endif } static int ip_tun_fill_encap_opts_geneve(struct sk_buff *skb, struct 
ip_tunnel_info *tun_info) { struct geneve_opt *opt; struct nlattr *nest; int offset = 0; nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_GENEVE); if (!nest) return -ENOMEM; while (tun_info->options_len > offset) { opt = ip_tunnel_info_opts(tun_info) + offset; if (nla_put_be16(skb, LWTUNNEL_IP_OPT_GENEVE_CLASS, opt->opt_class) || nla_put_u8(skb, LWTUNNEL_IP_OPT_GENEVE_TYPE, opt->type) || nla_put(skb, LWTUNNEL_IP_OPT_GENEVE_DATA, opt->length * 4, opt->opt_data)) { nla_nest_cancel(skb, nest); return -ENOMEM; } offset += sizeof(*opt) + opt->length * 4; } nla_nest_end(skb, nest); return 0; } static int ip_tun_fill_encap_opts_vxlan(struct sk_buff *skb, struct ip_tunnel_info *tun_info) { struct vxlan_metadata *md; struct nlattr *nest; nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_VXLAN); if (!nest) return -ENOMEM; md = ip_tunnel_info_opts(tun_info); if (nla_put_u32(skb, LWTUNNEL_IP_OPT_VXLAN_GBP, md->gbp)) { nla_nest_cancel(skb, nest); return -ENOMEM; } nla_nest_end(skb, nest); return 0; } static int ip_tun_fill_encap_opts_erspan(struct sk_buff *skb, struct ip_tunnel_info *tun_info) { struct erspan_metadata *md; struct nlattr *nest; nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_ERSPAN); if (!nest) return -ENOMEM; md = ip_tunnel_info_opts(tun_info); if (nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_VER, md->version)) goto err; if (md->version == 1 && nla_put_be32(skb, LWTUNNEL_IP_OPT_ERSPAN_INDEX, md->u.index)) goto err; if (md->version == 2 && (nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_DIR, md->u.md2.dir) || nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_HWID, get_hwid(&md->u.md2)))) goto err; nla_nest_end(skb, nest); return 0; err: nla_nest_cancel(skb, nest); return -ENOMEM; } static int ip_tun_fill_encap_opts(struct sk_buff *skb, int type, struct ip_tunnel_info *tun_info) { struct nlattr *nest; int err = 0; if (!(tun_info->key.tun_flags & TUNNEL_OPTIONS_PRESENT)) return 0; nest = nla_nest_start_noflag(skb, type); if (!nest) return -ENOMEM; if (tun_info->key.tun_flags & 
TUNNEL_GENEVE_OPT) err = ip_tun_fill_encap_opts_geneve(skb, tun_info); else if (tun_info->key.tun_flags & TUNNEL_VXLAN_OPT) err = ip_tun_fill_encap_opts_vxlan(skb, tun_info); else if (tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT) err = ip_tun_fill_encap_opts_erspan(skb, tun_info); if (err) { nla_nest_cancel(skb, nest); return err; } nla_nest_end(skb, nest); return 0; } static int ip_tun_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwtstate) { struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate); if (nla_put_be64(skb, LWTUNNEL_IP_ID, tun_info->key.tun_id, LWTUNNEL_IP_PAD) || nla_put_in_addr(skb, LWTUNNEL_IP_DST, tun_info->key.u.ipv4.dst) || nla_put_in_addr(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) || nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) || nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) || nla_put_be16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags) || ip_tun_fill_encap_opts(skb, LWTUNNEL_IP_OPTS, tun_info)) return -ENOMEM; return 0; } static int ip_tun_opts_nlsize(struct ip_tunnel_info *info) { int opt_len; if (!(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT)) return 0; opt_len = nla_total_size(0); /* LWTUNNEL_IP_OPTS */ if (info->key.tun_flags & TUNNEL_GENEVE_OPT) { struct geneve_opt *opt; int offset = 0; opt_len += nla_total_size(0); /* LWTUNNEL_IP_OPTS_GENEVE */ while (info->options_len > offset) { opt = ip_tunnel_info_opts(info) + offset; opt_len += nla_total_size(2) /* OPT_GENEVE_CLASS */ + nla_total_size(1) /* OPT_GENEVE_TYPE */ + nla_total_size(opt->length * 4); /* OPT_GENEVE_DATA */ offset += sizeof(*opt) + opt->length * 4; } } else if (info->key.tun_flags & TUNNEL_VXLAN_OPT) { opt_len += nla_total_size(0) /* LWTUNNEL_IP_OPTS_VXLAN */ + nla_total_size(4); /* OPT_VXLAN_GBP */ } else if (info->key.tun_flags & TUNNEL_ERSPAN_OPT) { struct erspan_metadata *md = ip_tunnel_info_opts(info); opt_len += nla_total_size(0) /* LWTUNNEL_IP_OPTS_ERSPAN */ + nla_total_size(1) /* OPT_ERSPAN_VER */ + (md->version == 1 ? 
nla_total_size(4) /* OPT_ERSPAN_INDEX (v1) */ : nla_total_size(1) + nla_total_size(1)); /* OPT_ERSPAN_DIR + HWID (v2) */ } return opt_len; } static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate) { return nla_total_size_64bit(8) /* LWTUNNEL_IP_ID */ + nla_total_size(4) /* LWTUNNEL_IP_DST */ + nla_total_size(4) /* LWTUNNEL_IP_SRC */ + nla_total_size(1) /* LWTUNNEL_IP_TOS */ + nla_total_size(1) /* LWTUNNEL_IP_TTL */ + nla_total_size(2) /* LWTUNNEL_IP_FLAGS */ + ip_tun_opts_nlsize(lwt_tun_info(lwtstate)); /* LWTUNNEL_IP_OPTS */ } static int ip_tun_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b) { struct ip_tunnel_info *info_a = lwt_tun_info(a); struct ip_tunnel_info *info_b = lwt_tun_info(b); return memcmp(info_a, info_b, sizeof(info_a->key)) || info_a->mode != info_b->mode || info_a->options_len != info_b->options_len || memcmp(ip_tunnel_info_opts(info_a), ip_tunnel_info_opts(info_b), info_a->options_len); } static const struct lwtunnel_encap_ops ip_tun_lwt_ops = { .build_state = ip_tun_build_state, .destroy_state = ip_tun_destroy_state, .fill_encap = ip_tun_fill_encap_info, .get_encap_size = ip_tun_encap_nlsize, .cmp_encap = ip_tun_cmp_encap, .owner = THIS_MODULE, }; static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = { [LWTUNNEL_IP6_UNSPEC] = { .strict_start_type = LWTUNNEL_IP6_OPTS }, [LWTUNNEL_IP6_ID] = { .type = NLA_U64 }, [LWTUNNEL_IP6_DST] = { .len = sizeof(struct in6_addr) }, [LWTUNNEL_IP6_SRC] = { .len = sizeof(struct in6_addr) }, [LWTUNNEL_IP6_HOPLIMIT] = { .type = NLA_U8 }, [LWTUNNEL_IP6_TC] = { .type = NLA_U8 }, [LWTUNNEL_IP6_FLAGS] = { .type = NLA_U16 }, [LWTUNNEL_IP6_OPTS] = { .type = NLA_NESTED }, }; static int ip6_tun_build_state(struct net *net, struct nlattr *attr, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) { struct nlattr *tb[LWTUNNEL_IP6_MAX + 1]; struct lwtunnel_state *new_state; struct ip_tunnel_info *tun_info; int err, opt_len; err = 
nla_parse_nested_deprecated(tb, LWTUNNEL_IP6_MAX, attr, ip6_tun_policy, extack); if (err < 0) return err; opt_len = ip_tun_get_optlen(tb[LWTUNNEL_IP6_OPTS], extack); if (opt_len < 0) return opt_len; new_state = lwtunnel_state_alloc(sizeof(*tun_info) + opt_len); if (!new_state) return -ENOMEM; new_state->type = LWTUNNEL_ENCAP_IP6; tun_info = lwt_tun_info(new_state); err = ip_tun_set_opts(tb[LWTUNNEL_IP6_OPTS], tun_info, extack); if (err < 0) { lwtstate_free(new_state); return err; } if (tb[LWTUNNEL_IP6_ID]) tun_info->key.tun_id = nla_get_be64(tb[LWTUNNEL_IP6_ID]); if (tb[LWTUNNEL_IP6_DST]) tun_info->key.u.ipv6.dst = nla_get_in6_addr(tb[LWTUNNEL_IP6_DST]); if (tb[LWTUNNEL_IP6_SRC]) tun_info->key.u.ipv6.src = nla_get_in6_addr(tb[LWTUNNEL_IP6_SRC]); if (tb[LWTUNNEL_IP6_HOPLIMIT]) tun_info->key.ttl = nla_get_u8(tb[LWTUNNEL_IP6_HOPLIMIT]); if (tb[LWTUNNEL_IP6_TC]) tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP6_TC]); if (tb[LWTUNNEL_IP6_FLAGS]) tun_info->key.tun_flags |= (nla_get_be16(tb[LWTUNNEL_IP6_FLAGS]) & ~TUNNEL_OPTIONS_PRESENT); tun_info->mode = IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_IPV6; tun_info->options_len = opt_len; *ts = new_state; return 0; } static int ip6_tun_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwtstate) { struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate); if (nla_put_be64(skb, LWTUNNEL_IP6_ID, tun_info->key.tun_id, LWTUNNEL_IP6_PAD) || nla_put_in6_addr(skb, LWTUNNEL_IP6_DST, &tun_info->key.u.ipv6.dst) || nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) || nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.tos) || nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.ttl) || nla_put_be16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags) || ip_tun_fill_encap_opts(skb, LWTUNNEL_IP6_OPTS, tun_info)) return -ENOMEM; return 0; } static int ip6_tun_encap_nlsize(struct lwtunnel_state *lwtstate) { return nla_total_size_64bit(8) /* LWTUNNEL_IP6_ID */ + nla_total_size(16) /* LWTUNNEL_IP6_DST */ + nla_total_size(16) /* 
LWTUNNEL_IP6_SRC */ + nla_total_size(1) /* LWTUNNEL_IP6_HOPLIMIT */ + nla_total_size(1) /* LWTUNNEL_IP6_TC */ + nla_total_size(2) /* LWTUNNEL_IP6_FLAGS */ + ip_tun_opts_nlsize(lwt_tun_info(lwtstate)); /* LWTUNNEL_IP6_OPTS */ } static const struct lwtunnel_encap_ops ip6_tun_lwt_ops = { .build_state = ip6_tun_build_state, .fill_encap = ip6_tun_fill_encap_info, .get_encap_size = ip6_tun_encap_nlsize, .cmp_encap = ip_tun_cmp_encap, .owner = THIS_MODULE, }; void __init ip_tunnel_core_init(void) { /* If you land here, make sure whether increasing ip_tunnel_info's * options_len is a reasonable choice with its usage in front ends * (f.e., it's part of flow keys, etc). */ BUILD_BUG_ON(IP_TUNNEL_OPTS_MAX != 255); lwtunnel_encap_add_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP); lwtunnel_encap_add_ops(&ip6_tun_lwt_ops, LWTUNNEL_ENCAP_IP6); } DEFINE_STATIC_KEY_FALSE(ip_tunnel_metadata_cnt); EXPORT_SYMBOL(ip_tunnel_metadata_cnt); void ip_tunnel_need_metadata(void) { static_branch_inc(&ip_tunnel_metadata_cnt); } EXPORT_SYMBOL_GPL(ip_tunnel_need_metadata); void ip_tunnel_unneed_metadata(void) { static_branch_dec(&ip_tunnel_metadata_cnt); } EXPORT_SYMBOL_GPL(ip_tunnel_unneed_metadata); /* Returns either the correct skb->protocol value, or 0 if invalid. 
*/ __be16 ip_tunnel_parse_protocol(const struct sk_buff *skb) { if (skb_network_header(skb) >= skb->head && (skb_network_header(skb) + sizeof(struct iphdr)) <= skb_tail_pointer(skb) && ip_hdr(skb)->version == 4) return htons(ETH_P_IP); if (skb_network_header(skb) >= skb->head && (skb_network_header(skb) + sizeof(struct ipv6hdr)) <= skb_tail_pointer(skb) && ipv6_hdr(skb)->version == 6) return htons(ETH_P_IPV6); return 0; } EXPORT_SYMBOL(ip_tunnel_parse_protocol); const struct header_ops ip_tunnel_header_ops = { .parse_protocol = ip_tunnel_parse_protocol }; EXPORT_SYMBOL(ip_tunnel_header_ops); /* This function returns true when ENCAP attributes are present in the nl msg */ bool ip_tunnel_netlink_encap_parms(struct nlattr *data[], struct ip_tunnel_encap *encap) { bool ret = false; memset(encap, 0, sizeof(*encap)); if (!data) return ret; if (data[IFLA_IPTUN_ENCAP_TYPE]) { ret = true; encap->type = nla_get_u16(data[IFLA_IPTUN_ENCAP_TYPE]); } if (data[IFLA_IPTUN_ENCAP_FLAGS]) { ret = true; encap->flags = nla_get_u16(data[IFLA_IPTUN_ENCAP_FLAGS]); } if (data[IFLA_IPTUN_ENCAP_SPORT]) { ret = true; encap->sport = nla_get_be16(data[IFLA_IPTUN_ENCAP_SPORT]); } if (data[IFLA_IPTUN_ENCAP_DPORT]) { ret = true; encap->dport = nla_get_be16(data[IFLA_IPTUN_ENCAP_DPORT]); } return ret; } EXPORT_SYMBOL_GPL(ip_tunnel_netlink_encap_parms); void ip_tunnel_netlink_parms(struct nlattr *data[], struct ip_tunnel_parm *parms) { if (data[IFLA_IPTUN_LINK]) parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]); if (data[IFLA_IPTUN_LOCAL]) parms->iph.saddr = nla_get_be32(data[IFLA_IPTUN_LOCAL]); if (data[IFLA_IPTUN_REMOTE]) parms->iph.daddr = nla_get_be32(data[IFLA_IPTUN_REMOTE]); if (data[IFLA_IPTUN_TTL]) { parms->iph.ttl = nla_get_u8(data[IFLA_IPTUN_TTL]); if (parms->iph.ttl) parms->iph.frag_off = htons(IP_DF); } if (data[IFLA_IPTUN_TOS]) parms->iph.tos = nla_get_u8(data[IFLA_IPTUN_TOS]); if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC])) parms->iph.frag_off = htons(IP_DF); if 
(data[IFLA_IPTUN_FLAGS]) parms->i_flags = nla_get_be16(data[IFLA_IPTUN_FLAGS]); if (data[IFLA_IPTUN_PROTO]) parms->iph.protocol = nla_get_u8(data[IFLA_IPTUN_PROTO]); } EXPORT_SYMBOL_GPL(ip_tunnel_netlink_parms);
74 78 4 72 64 7 2 3 2 48 20 65 3 64 2 2 59 8 35 34 3 31 30 31 12 2 10 10 1 9 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 // SPDX-License-Identifier: GPL-2.0-or-later /* * GRE over IPv4 demultiplexer driver * * Authors: Dmitry Kozlov (xeb@mail.ru) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/if.h> #include <linux/icmp.h> #include <linux/kernel.h> #include <linux/kmod.h> #include <linux/skbuff.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/netdevice.h> #include <linux/if_tunnel.h> #include <linux/spinlock.h> #include <net/protocol.h> #include <net/gre.h> #include <net/erspan.h> #include <net/icmp.h> #include <net/route.h> #include <net/xfrm.h> static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly; int gre_add_protocol(const struct gre_protocol *proto, u8 version) { if (version >= GREPROTO_MAX) return -EINVAL; return (cmpxchg((const struct gre_protocol **)&gre_proto[version], NULL, proto) == NULL) ? 0 : -EBUSY; } EXPORT_SYMBOL_GPL(gre_add_protocol); int gre_del_protocol(const struct gre_protocol *proto, u8 version) { int ret; if (version >= GREPROTO_MAX) return -EINVAL; ret = (cmpxchg((const struct gre_protocol **)&gre_proto[version], proto, NULL) == proto) ? 
0 : -EBUSY; if (ret) return ret; synchronize_rcu(); return 0; } EXPORT_SYMBOL_GPL(gre_del_protocol); /* Fills in tpi and returns header length to be pulled. * Note that caller must use pskb_may_pull() before pulling GRE header. */ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, bool *csum_err, __be16 proto, int nhs) { const struct gre_base_hdr *greh; __be32 *options; int hdr_len; if (unlikely(!pskb_may_pull(skb, nhs + sizeof(struct gre_base_hdr)))) return -EINVAL; greh = (struct gre_base_hdr *)(skb->data + nhs); if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING))) return -EINVAL; tpi->flags = gre_flags_to_tnl_flags(greh->flags); hdr_len = gre_calc_hlen(tpi->flags); if (!pskb_may_pull(skb, nhs + hdr_len)) return -EINVAL; greh = (struct gre_base_hdr *)(skb->data + nhs); tpi->proto = greh->protocol; options = (__be32 *)(greh + 1); if (greh->flags & GRE_CSUM) { if (!skb_checksum_simple_validate(skb)) { skb_checksum_try_convert(skb, IPPROTO_GRE, null_compute_pseudo); } else if (csum_err) { *csum_err = true; return -EINVAL; } options++; } if (greh->flags & GRE_KEY) { tpi->key = *options; options++; } else { tpi->key = 0; } if (unlikely(greh->flags & GRE_SEQ)) { tpi->seq = *options; options++; } else { tpi->seq = 0; } /* WCCP version 1 and 2 protocol decoding. 
* - Change protocol to IPv4/IPv6 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header */ if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) { u8 _val, *val; val = skb_header_pointer(skb, nhs + hdr_len, sizeof(_val), &_val); if (!val) return -EINVAL; tpi->proto = proto; if ((*val & 0xF0) != 0x40) hdr_len += 4; } tpi->hdr_len = hdr_len; /* ERSPAN ver 1 and 2 protocol sets GRE key field * to 0 and sets the configured key in the * inner erspan header field */ if ((greh->protocol == htons(ETH_P_ERSPAN) && hdr_len != 4) || greh->protocol == htons(ETH_P_ERSPAN2)) { struct erspan_base_hdr *ershdr; if (!pskb_may_pull(skb, nhs + hdr_len + sizeof(*ershdr))) return -EINVAL; ershdr = (struct erspan_base_hdr *)(skb->data + nhs + hdr_len); tpi->key = cpu_to_be32(get_session_id(ershdr)); } return hdr_len; } EXPORT_SYMBOL(gre_parse_header); static int gre_rcv(struct sk_buff *skb) { const struct gre_protocol *proto; u8 ver; int ret; if (!pskb_may_pull(skb, 12)) goto drop; ver = skb->data[1]&0x7f; if (ver >= GREPROTO_MAX) goto drop; rcu_read_lock(); proto = rcu_dereference(gre_proto[ver]); if (!proto || !proto->handler) goto drop_unlock; ret = proto->handler(skb); rcu_read_unlock(); return ret; drop_unlock: rcu_read_unlock(); drop: kfree_skb(skb); return NET_RX_DROP; } static int gre_err(struct sk_buff *skb, u32 info) { const struct gre_protocol *proto; const struct iphdr *iph = (const struct iphdr *)skb->data; u8 ver = skb->data[(iph->ihl<<2) + 1]&0x7f; int err = 0; if (ver >= GREPROTO_MAX) return -EINVAL; rcu_read_lock(); proto = rcu_dereference(gre_proto[ver]); if (proto && proto->err_handler) proto->err_handler(skb, info); else err = -EPROTONOSUPPORT; rcu_read_unlock(); return err; } static const struct net_protocol net_gre_protocol = { .handler = gre_rcv, .err_handler = gre_err, }; static int __init gre_init(void) { pr_info("GRE over IPv4 demultiplexor driver\n"); if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) { pr_err("can't add protocol\n"); return 
-EAGAIN; } return 0; } static void __exit gre_exit(void) { inet_del_protocol(&net_gre_protocol, IPPROTO_GRE); } module_init(gre_init); module_exit(gre_exit); MODULE_DESCRIPTION("GRE over IPv4 demultiplexer driver"); MODULE_AUTHOR("D. Kozlov <xeb@mail.ru>"); MODULE_LICENSE("GPL");
7 6 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 // SPDX-License-Identifier: GPL-2.0-only /* Kernel module to match AH parameters. */ /* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/types.h> #include <net/checksum.h> #include <net/ipv6.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter_ipv6/ip6t_ah.h> MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Xtables: IPv6 IPsec-AH match"); MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>"); /* Returns 1 if the spi is matched by the range, 0 otherwise */ static inline bool spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert) { bool r; pr_debug("spi_match:%c 0x%x <= 0x%x <= 0x%x\n", invert ? '!' : ' ', min, spi, max); r = (spi >= min && spi <= max) ^ invert; pr_debug(" result %s\n", r ? 
"PASS" : "FAILED"); return r; } static bool ah_mt6(const struct sk_buff *skb, struct xt_action_param *par) { struct ip_auth_hdr _ah; const struct ip_auth_hdr *ah; const struct ip6t_ah *ahinfo = par->matchinfo; unsigned int ptr = 0; unsigned int hdrlen = 0; int err; err = ipv6_find_hdr(skb, &ptr, NEXTHDR_AUTH, NULL, NULL); if (err < 0) { if (err != -ENOENT) par->hotdrop = true; return false; } ah = skb_header_pointer(skb, ptr, sizeof(_ah), &_ah); if (ah == NULL) { par->hotdrop = true; return false; } hdrlen = ipv6_authlen(ah); pr_debug("IPv6 AH LEN %u %u ", hdrlen, ah->hdrlen); pr_debug("RES %04X ", ah->reserved); pr_debug("SPI %u %08X\n", ntohl(ah->spi), ntohl(ah->spi)); pr_debug("IPv6 AH spi %02X ", spi_match(ahinfo->spis[0], ahinfo->spis[1], ntohl(ah->spi), !!(ahinfo->invflags & IP6T_AH_INV_SPI))); pr_debug("len %02X %04X %02X ", ahinfo->hdrlen, hdrlen, (!ahinfo->hdrlen || (ahinfo->hdrlen == hdrlen) ^ !!(ahinfo->invflags & IP6T_AH_INV_LEN))); pr_debug("res %02X %04X %02X\n", ahinfo->hdrres, ah->reserved, !(ahinfo->hdrres && ah->reserved)); return spi_match(ahinfo->spis[0], ahinfo->spis[1], ntohl(ah->spi), !!(ahinfo->invflags & IP6T_AH_INV_SPI)) && (!ahinfo->hdrlen || (ahinfo->hdrlen == hdrlen) ^ !!(ahinfo->invflags & IP6T_AH_INV_LEN)) && !(ahinfo->hdrres && ah->reserved); } static int ah_mt6_check(const struct xt_mtchk_param *par) { const struct ip6t_ah *ahinfo = par->matchinfo; if (ahinfo->invflags & ~IP6T_AH_INV_MASK) { pr_debug("unknown flags %X\n", ahinfo->invflags); return -EINVAL; } return 0; } static struct xt_match ah_mt6_reg __read_mostly = { .name = "ah", .family = NFPROTO_IPV6, .match = ah_mt6, .matchsize = sizeof(struct ip6t_ah), .checkentry = ah_mt6_check, .me = THIS_MODULE, }; static int __init ah_mt6_init(void) { return xt_register_match(&ah_mt6_reg); } static void __exit ah_mt6_exit(void) { xt_unregister_match(&ah_mt6_reg); } module_init(ah_mt6_init); module_exit(ah_mt6_exit);
5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * include/linux/if_team.h - Network team device driver header * Copyright (c) 2011 Jiri Pirko <jpirko@redhat.com> */ #ifndef _LINUX_IF_TEAM_H_ #define _LINUX_IF_TEAM_H_ #include <linux/netpoll.h> #include <net/sch_generic.h> #include <linux/types.h> #include <uapi/linux/if_team.h> struct team_pcpu_stats { u64_stats_t rx_packets; u64_stats_t rx_bytes; u64_stats_t rx_multicast; u64_stats_t tx_packets; u64_stats_t tx_bytes; struct u64_stats_sync syncp; u32 rx_dropped; u32 tx_dropped; u32 rx_nohandler; }; struct team; struct team_port { struct net_device *dev; struct hlist_node hlist; /* node in enabled ports hash list */ struct list_head list; /* node in ordinary list */ struct team *team; int index; /* index of enabled port. 
If disabled, it's set to -1 */ bool linkup; /* either state.linkup or user.linkup */ struct { bool linkup; u32 speed; u8 duplex; } state; /* Values set by userspace */ struct { bool linkup; bool linkup_enabled; } user; /* Custom gennetlink interface related flags */ bool changed; bool removed; /* * A place for storing original values of the device before it * become a port. */ struct { unsigned char dev_addr[MAX_ADDR_LEN]; unsigned int mtu; } orig; #ifdef CONFIG_NET_POLL_CONTROLLER struct netpoll *np; #endif s32 priority; /* lower number ~ higher priority */ u16 queue_id; struct list_head qom_list; /* node in queue override mapping list */ struct rcu_head rcu; long mode_priv[]; }; static inline struct team_port *team_port_get_rcu(const struct net_device *dev) { return rcu_dereference(dev->rx_handler_data); } static inline bool team_port_enabled(struct team_port *port) { return port->index != -1; } static inline bool team_port_txable(struct team_port *port) { return port->linkup && team_port_enabled(port); } static inline bool team_port_dev_txable(const struct net_device *port_dev) { struct team_port *port; bool txable; rcu_read_lock(); port = team_port_get_rcu(port_dev); txable = port ? 
team_port_txable(port) : false; rcu_read_unlock(); return txable; } #ifdef CONFIG_NET_POLL_CONTROLLER static inline void team_netpoll_send_skb(struct team_port *port, struct sk_buff *skb) { netpoll_send_skb(port->np, skb); } #else static inline void team_netpoll_send_skb(struct team_port *port, struct sk_buff *skb) { } #endif struct team_mode_ops { int (*init)(struct team *team); void (*exit)(struct team *team); rx_handler_result_t (*receive)(struct team *team, struct team_port *port, struct sk_buff *skb); bool (*transmit)(struct team *team, struct sk_buff *skb); int (*port_enter)(struct team *team, struct team_port *port); void (*port_leave)(struct team *team, struct team_port *port); void (*port_change_dev_addr)(struct team *team, struct team_port *port); void (*port_enabled)(struct team *team, struct team_port *port); void (*port_disabled)(struct team *team, struct team_port *port); }; extern int team_modeop_port_enter(struct team *team, struct team_port *port); extern void team_modeop_port_change_dev_addr(struct team *team, struct team_port *port); enum team_option_type { TEAM_OPTION_TYPE_U32, TEAM_OPTION_TYPE_STRING, TEAM_OPTION_TYPE_BINARY, TEAM_OPTION_TYPE_BOOL, TEAM_OPTION_TYPE_S32, }; struct team_option_inst_info { u32 array_index; struct team_port *port; /* != NULL if per-port */ }; struct team_gsetter_ctx { union { u32 u32_val; const char *str_val; struct { const void *ptr; u32 len; } bin_val; bool bool_val; s32 s32_val; } data; struct team_option_inst_info *info; }; struct team_option { struct list_head list; const char *name; bool per_port; unsigned int array_size; /* != 0 means the option is array */ enum team_option_type type; void (*init)(struct team *team, struct team_option_inst_info *info); void (*getter)(struct team *team, struct team_gsetter_ctx *ctx); int (*setter)(struct team *team, struct team_gsetter_ctx *ctx); }; extern void team_option_inst_set_change(struct team_option_inst_info *opt_inst_info); extern void 
team_options_change_check(struct team *team); struct team_mode { const char *kind; struct module *owner; size_t priv_size; size_t port_priv_size; const struct team_mode_ops *ops; enum netdev_lag_tx_type lag_tx_type; }; #define TEAM_PORT_HASHBITS 4 #define TEAM_PORT_HASHENTRIES (1 << TEAM_PORT_HASHBITS) #define TEAM_MODE_PRIV_LONGS 4 #define TEAM_MODE_PRIV_SIZE (sizeof(long) * TEAM_MODE_PRIV_LONGS) struct team { struct net_device *dev; /* associated netdevice */ struct team_pcpu_stats __percpu *pcpu_stats; const struct header_ops *header_ops_cache; struct mutex lock; /* used for overall locking, e.g. port lists write */ /* * List of enabled ports and their count */ int en_port_count; struct hlist_head en_port_hlist[TEAM_PORT_HASHENTRIES]; struct list_head port_list; /* list of all ports */ struct list_head option_list; struct list_head option_inst_list; /* list of option instances */ const struct team_mode *mode; struct team_mode_ops ops; bool user_carrier_enabled; bool queue_override_enabled; struct list_head *qom_lists; /* array of queue override mapping lists */ bool port_mtu_change_allowed; bool notifier_ctx; struct { unsigned int count; unsigned int interval; /* in ms */ atomic_t count_pending; struct delayed_work dw; } notify_peers; struct { unsigned int count; unsigned int interval; /* in ms */ atomic_t count_pending; struct delayed_work dw; } mcast_rejoin; struct lock_class_key team_lock_key; long mode_priv[TEAM_MODE_PRIV_LONGS]; }; static inline int team_dev_queue_xmit(struct team *team, struct team_port *port, struct sk_buff *skb) { BUILD_BUG_ON(sizeof(skb->queue_mapping) != sizeof(qdisc_skb_cb(skb)->slave_dev_queue_mapping)); skb_set_queue_mapping(skb, qdisc_skb_cb(skb)->slave_dev_queue_mapping); skb->dev = port->dev; if (unlikely(netpoll_tx_running(team->dev))) { team_netpoll_send_skb(port, skb); return 0; } return dev_queue_xmit(skb); } static inline struct hlist_head *team_port_index_hash(struct team *team, int port_index) { return 
&team->en_port_hlist[port_index & (TEAM_PORT_HASHENTRIES - 1)]; } static inline struct team_port *team_get_port_by_index(struct team *team, int port_index) { struct team_port *port; struct hlist_head *head = team_port_index_hash(team, port_index); hlist_for_each_entry(port, head, hlist) if (port->index == port_index) return port; return NULL; } static inline int team_num_to_port_index(struct team *team, unsigned int num) { int en_port_count = READ_ONCE(team->en_port_count); if (unlikely(!en_port_count)) return 0; return num % en_port_count; } static inline struct team_port *team_get_port_by_index_rcu(struct team *team, int port_index) { struct team_port *port; struct hlist_head *head = team_port_index_hash(team, port_index); hlist_for_each_entry_rcu(port, head, hlist) if (port->index == port_index) return port; return NULL; } static inline struct team_port * team_get_first_port_txable_rcu(struct team *team, struct team_port *port) { struct team_port *cur; if (likely(team_port_txable(port))) return port; cur = port; list_for_each_entry_continue_rcu(cur, &team->port_list, list) if (team_port_txable(cur)) return cur; list_for_each_entry_rcu(cur, &team->port_list, list) { if (cur == port) break; if (team_port_txable(cur)) return cur; } return NULL; } extern int team_options_register(struct team *team, const struct team_option *option, size_t option_count); extern void team_options_unregister(struct team *team, const struct team_option *option, size_t option_count); extern int team_mode_register(const struct team_mode *mode); extern void team_mode_unregister(const struct team_mode *mode); #define TEAM_DEFAULT_NUM_TX_QUEUES 16 #define TEAM_DEFAULT_NUM_RX_QUEUES 16 #define MODULE_ALIAS_TEAM_MODE(kind) MODULE_ALIAS("team-mode-" kind) #endif /* _LINUX_IF_TEAM_H_ */
27423 3 16 27423 3 16 4797 27423 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _X86_IRQFLAGS_H_ #define _X86_IRQFLAGS_H_ #include <asm/processor-flags.h> #ifndef __ASSEMBLY__ #include <asm/nospec-branch.h> /* * Interrupt control: */ /* Declaration required for gcc < 4.9 to prevent -Werror=missing-prototypes */ extern inline unsigned long native_save_fl(void); extern __always_inline unsigned long native_save_fl(void) { unsigned long flags; /* * "=rm" is safe here, because "pop" adjusts the stack before * it evaluates its effective address -- this is part of the * documented behavior of the "pop" instruction. 
*/ asm volatile("# __raw_save_flags\n\t" "pushf ; pop %0" : "=rm" (flags) : /* no input */ : "memory"); return flags; } static __always_inline void native_irq_disable(void) { asm volatile("cli": : :"memory"); } static __always_inline void native_irq_enable(void) { asm volatile("sti": : :"memory"); } static __always_inline void native_safe_halt(void) { mds_idle_clear_cpu_buffers(); asm volatile("sti; hlt": : :"memory"); } static __always_inline void native_halt(void) { mds_idle_clear_cpu_buffers(); asm volatile("hlt": : :"memory"); } #endif #ifdef CONFIG_PARAVIRT_XXL #include <asm/paravirt.h> #else #ifndef __ASSEMBLY__ #include <linux/types.h> static __always_inline unsigned long arch_local_save_flags(void) { return native_save_fl(); } static __always_inline void arch_local_irq_disable(void) { native_irq_disable(); } static __always_inline void arch_local_irq_enable(void) { native_irq_enable(); } /* * Used in the idle loop; sti takes one instruction cycle * to complete: */ static __always_inline void arch_safe_halt(void) { native_safe_halt(); } /* * Used when interrupts are already enabled or to * shutdown the processor: */ static __always_inline void halt(void) { native_halt(); } /* * For spinlocks, etc: */ static __always_inline unsigned long arch_local_irq_save(void) { unsigned long flags = arch_local_save_flags(); arch_local_irq_disable(); return flags; } #else #ifdef CONFIG_X86_64 #ifdef CONFIG_DEBUG_ENTRY #define SAVE_FLAGS pushfq; popq %rax #endif #endif #endif /* __ASSEMBLY__ */ #endif /* CONFIG_PARAVIRT_XXL */ #ifndef __ASSEMBLY__ static __always_inline int arch_irqs_disabled_flags(unsigned long flags) { return !(flags & X86_EFLAGS_IF); } static __always_inline int arch_irqs_disabled(void) { unsigned long flags = arch_local_save_flags(); return arch_irqs_disabled_flags(flags); } static __always_inline void arch_local_irq_restore(unsigned long flags) { if (!arch_irqs_disabled_flags(flags)) arch_local_irq_enable(); } #endif /* !__ASSEMBLY__ */ #endif
2 1 1 5 4 5 5 1 5 9 9 6 5 2 1 2 6 4 5 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 // SPDX-License-Identifier: GPL-2.0 /* MPTCP socket monitoring support * * Copyright (c) 2020 Red Hat * * Author: Paolo Abeni <pabeni@redhat.com> */ #include <linux/kernel.h> #include <linux/net.h> #include <linux/inet_diag.h> #include <net/netlink.h> #include "protocol.h" static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *req, struct nlattr *bc, bool net_admin) { if (!inet_diag_bc_sk(bc, sk)) return 0; return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, req, NLM_F_MULTI, net_admin); } static int mptcp_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { struct sk_buff *in_skb = cb->skb; struct mptcp_sock *msk = NULL; struct sk_buff *rep; int err = -ENOENT; struct net *net; struct sock *sk; net = sock_net(in_skb->sk); msk = mptcp_token_get_sock(net, req->id.idiag_cookie[0]); if (!msk) goto out_nosk; err = -ENOMEM; sk = (struct sock *)msk; rep = nlmsg_new(nla_total_size(sizeof(struct inet_diag_msg)) + inet_diag_msg_attrs_size() + 
nla_total_size(sizeof(struct mptcp_info)) + nla_total_size(sizeof(struct inet_diag_meminfo)) + 64, GFP_KERNEL); if (!rep) goto out; err = inet_sk_diag_fill(sk, inet_csk(sk), rep, cb, req, 0, netlink_net_capable(in_skb, CAP_NET_ADMIN)); if (err < 0) { WARN_ON(err == -EMSGSIZE); kfree_skb(rep); goto out; } err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid); out: sock_put(sk); out_nosk: return err; } struct mptcp_diag_ctx { long s_slot; long s_num; unsigned int l_slot; unsigned int l_num; }; static void mptcp_diag_dump_listeners(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, bool net_admin) { struct inet_diag_dump_data *cb_data = cb->data; struct mptcp_diag_ctx *diag_ctx = (void *)cb->ctx; struct nlattr *bc = cb_data->inet_diag_nla_bc; struct net *net = sock_net(skb->sk); struct inet_hashinfo *hinfo; int i; hinfo = net->ipv4.tcp_death_row.hashinfo; for (i = diag_ctx->l_slot; i <= hinfo->lhash2_mask; i++) { struct inet_listen_hashbucket *ilb; struct hlist_nulls_node *node; struct sock *sk; int num = 0; ilb = &hinfo->lhash2[i]; rcu_read_lock(); spin_lock(&ilb->lock); sk_nulls_for_each(sk, node, &ilb->nulls_head) { const struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk); struct inet_sock *inet = inet_sk(sk); int ret; if (num < diag_ctx->l_num) goto next_listen; if (!ctx || strcmp(inet_csk(sk)->icsk_ulp_ops->name, "mptcp")) goto next_listen; sk = ctx->conn; if (!sk || !net_eq(sock_net(sk), net)) goto next_listen; if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) goto next_listen; if (r->id.idiag_sport != inet->inet_sport && r->id.idiag_sport) goto next_listen; if (!refcount_inc_not_zero(&sk->sk_refcnt)) goto next_listen; ret = sk_diag_dump(sk, skb, cb, r, bc, net_admin); sock_put(sk); if (ret < 0) { spin_unlock(&ilb->lock); rcu_read_unlock(); diag_ctx->l_slot = i; diag_ctx->l_num = num; return; } diag_ctx->l_num = num + 1; num = 0; next_listen: ++num; } spin_unlock(&ilb->lock); 
rcu_read_unlock(); cond_resched(); diag_ctx->l_num = 0; } diag_ctx->l_num = 0; diag_ctx->l_slot = i; } static void mptcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); struct mptcp_diag_ctx *diag_ctx = (void *)cb->ctx; struct net *net = sock_net(skb->sk); struct inet_diag_dump_data *cb_data; struct mptcp_sock *msk; struct nlattr *bc; BUILD_BUG_ON(sizeof(cb->ctx) < sizeof(*diag_ctx)); cb_data = cb->data; bc = cb_data->inet_diag_nla_bc; while ((msk = mptcp_token_iter_next(net, &diag_ctx->s_slot, &diag_ctx->s_num)) != NULL) { struct inet_sock *inet = (struct inet_sock *)msk; struct sock *sk = (struct sock *)msk; int ret = 0; if (!(r->idiag_states & (1 << sk->sk_state))) goto next; if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) goto next; if (r->id.idiag_sport != inet->inet_sport && r->id.idiag_sport) goto next; if (r->id.idiag_dport != inet->inet_dport && r->id.idiag_dport) goto next; ret = sk_diag_dump(sk, skb, cb, r, bc, net_admin); next: sock_put(sk); if (ret < 0) { /* will retry on the same position */ diag_ctx->s_num--; break; } cond_resched(); } if ((r->idiag_states & TCPF_LISTEN) && r->id.idiag_dport == 0) mptcp_diag_dump_listeners(skb, cb, r, net_admin); } static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, void *_info) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_info *info = _info; r->idiag_rqueue = sk_rmem_alloc_get(sk); r->idiag_wqueue = sk_wmem_alloc_get(sk); if (inet_sk_state_load(sk) == TCP_LISTEN) { struct sock *lsk = READ_ONCE(msk->first); if (lsk) { /* override with settings from tcp listener, * so Send-Q will show accept queue. 
*/ r->idiag_rqueue = READ_ONCE(lsk->sk_ack_backlog); r->idiag_wqueue = READ_ONCE(lsk->sk_max_ack_backlog); } } if (!info) return; mptcp_diag_fill_info(msk, info); } static const struct inet_diag_handler mptcp_diag_handler = { .owner = THIS_MODULE, .dump = mptcp_diag_dump, .dump_one = mptcp_diag_dump_one, .idiag_get_info = mptcp_diag_get_info, .idiag_type = IPPROTO_MPTCP, .idiag_info_size = sizeof(struct mptcp_info), }; static int __init mptcp_diag_init(void) { return inet_diag_register(&mptcp_diag_handler); } static void __exit mptcp_diag_exit(void) { inet_diag_unregister(&mptcp_diag_handler); } module_init(mptcp_diag_init); module_exit(mptcp_diag_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("MPTCP socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-262 /* AF_INET - IPPROTO_MPTCP */);
61 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner */ #ifndef _NET_BATMAN_ADV_SOFT_INTERFACE_H_ #define _NET_BATMAN_ADV_SOFT_INTERFACE_H_ #include "main.h" #include <linux/kref.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/types.h> #include <net/rtnetlink.h> int batadv_skb_head_push(struct sk_buff *skb, unsigned int len); void batadv_interface_rx(struct net_device *soft_iface, struct sk_buff *skb, int hdr_size, struct batadv_orig_node *orig_node); bool batadv_softif_is_valid(const struct net_device *net_dev); extern struct rtnl_link_ops batadv_link_ops; int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid); void batadv_softif_vlan_release(struct kref *ref); struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv, unsigned short vid); /** * batadv_softif_vlan_put() - decrease the vlan object refcounter and * possibly release it * @vlan: the vlan object to release */ static inline void batadv_softif_vlan_put(struct batadv_softif_vlan *vlan) { if (!vlan) return; kref_put(&vlan->refcount, batadv_softif_vlan_release); } #endif /* _NET_BATMAN_ADV_SOFT_INTERFACE_H_ */
317 164 29 309 151 3 15 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 
521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 
1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001 Intel Corp. * Copyright (c) 2001 Nokia, Inc. * * This file is part of the SCTP kernel implementation * * These are the state tables for the SCTP state machine. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * Jon Grimm <jgrimm@us.ibm.com> * Hui Huang <hui.huang@nokia.com> * Daisy Chang <daisyc@us.ibm.com> * Ardelle Fan <ardelle.fan@intel.com> * Sridhar Samudrala <sri@us.ibm.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/skbuff.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> static const struct sctp_sm_table_entry primitive_event_table[SCTP_NUM_PRIMITIVE_TYPES][SCTP_STATE_NUM_STATES]; static const struct sctp_sm_table_entry other_event_table[SCTP_NUM_OTHER_TYPES][SCTP_STATE_NUM_STATES]; static const struct sctp_sm_table_entry timeout_event_table[SCTP_NUM_TIMEOUT_TYPES][SCTP_STATE_NUM_STATES]; static const struct sctp_sm_table_entry *sctp_chunk_event_lookup( struct net *net, enum sctp_cid cid, enum sctp_state state); static const struct sctp_sm_table_entry bug = { .fn = sctp_sf_bug, .name = "sctp_sf_bug" }; #define DO_LOOKUP(_max, _type, _table) \ ({ \ const struct sctp_sm_table_entry *rtn; \ \ if ((event_subtype._type > (_max))) { \ pr_warn("table %p possible attack: event %d exceeds max %d\n", \ _table, event_subtype._type, _max); \ rtn = &bug; \ } else \ rtn = &_table[event_subtype._type][(int)state]; \ \ rtn; \ }) const struct sctp_sm_table_entry *sctp_sm_lookup_event( struct net *net, enum sctp_event_type 
event_type, enum sctp_state state, union sctp_subtype event_subtype) { switch (event_type) { case SCTP_EVENT_T_CHUNK: return sctp_chunk_event_lookup(net, event_subtype.chunk, state); case SCTP_EVENT_T_TIMEOUT: return DO_LOOKUP(SCTP_EVENT_TIMEOUT_MAX, timeout, timeout_event_table); case SCTP_EVENT_T_OTHER: return DO_LOOKUP(SCTP_EVENT_OTHER_MAX, other, other_event_table); case SCTP_EVENT_T_PRIMITIVE: return DO_LOOKUP(SCTP_EVENT_PRIMITIVE_MAX, primitive, primitive_event_table); default: /* Yikes! We got an illegal event type. */ return &bug; } } #define TYPE_SCTP_FUNC(func) {.fn = func, .name = #func} #define TYPE_SCTP_DATA { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ootb), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_eat_data_6_2), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_eat_data_6_2), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_eat_data_fast_4_4), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_DATA */ #define TYPE_SCTP_INIT { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_1B_init), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_1_siminit), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_1_siminit), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_2_dupinit), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_2_dupinit), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_2_dupinit), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_2_dupinit), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_2_reshutack), \ } /* TYPE_SCTP_INIT */ #define TYPE_SCTP_INIT_ACK { \ /* SCTP_STATE_CLOSED */ \ 
TYPE_SCTP_FUNC(sctp_sf_do_5_2_3_initack), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_1C_ack), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_INIT_ACK */ #define TYPE_SCTP_SACK { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ootb), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_eat_sack_6_2), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_eat_sack_6_2), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_eat_sack_6_2), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_eat_sack_6_2), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_SACK */ #define TYPE_SCTP_HEARTBEAT { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ootb), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ /* This should not happen, but we are nice. 
*/ \ TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \ } /* TYPE_SCTP_HEARTBEAT */ #define TYPE_SCTP_HEARTBEAT_ACK { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ootb), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_violation), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_backbeat_8_3), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_backbeat_8_3), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_backbeat_8_3), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_backbeat_8_3), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_HEARTBEAT_ACK */ #define TYPE_SCTP_ABORT { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_pdiscard), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_cookie_wait_abort), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_cookie_echoed_abort), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_1_abort), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_shutdown_pending_abort), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_shutdown_sent_abort), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_1_abort), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_shutdown_ack_sent_abort), \ } /* TYPE_SCTP_ABORT */ #define TYPE_SCTP_SHUTDOWN { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ootb), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_2_shutdown), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_2_shutdown), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_2_shutdown_ack), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_2_shut_ctsn), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ 
TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_SHUTDOWN */ #define TYPE_SCTP_SHUTDOWN_ACK { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ootb), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_do_8_5_1_E_sa), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_do_8_5_1_E_sa), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_violation), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_violation), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_2_final), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_violation), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_2_final), \ } /* TYPE_SCTP_SHUTDOWN_ACK */ #define TYPE_SCTP_ERROR { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ootb), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_cookie_echoed_err), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_operr_notify), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_operr_notify), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_operr_notify), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_ERROR */ #define TYPE_SCTP_COOKIE_ECHO { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_1D_ce), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \ } /* 
TYPE_SCTP_COOKIE_ECHO */ #define TYPE_SCTP_COOKIE_ACK { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_1E_ca), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_COOKIE_ACK */ #define TYPE_SCTP_ECN_ECNE { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_do_ecne), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_ecne), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_ecne), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_ecne), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_ecne), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_ECN_ECNE */ #define TYPE_SCTP_ECN_CWR { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_ecn_cwr), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_ecn_cwr), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_ecn_cwr), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_ECN_CWR */ #define TYPE_SCTP_SHUTDOWN_COMPLETE { \ /* SCTP_STATE_CLOSED */ 
\ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_4_C), \ } /* TYPE_SCTP_SHUTDOWN_COMPLETE */ /* The primary index for this table is the chunk type. * The secondary index for this table is the state. * * For base protocol (RFC 2960). */ static const struct sctp_sm_table_entry chunk_event_table[SCTP_NUM_BASE_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_DATA, TYPE_SCTP_INIT, TYPE_SCTP_INIT_ACK, TYPE_SCTP_SACK, TYPE_SCTP_HEARTBEAT, TYPE_SCTP_HEARTBEAT_ACK, TYPE_SCTP_ABORT, TYPE_SCTP_SHUTDOWN, TYPE_SCTP_SHUTDOWN_ACK, TYPE_SCTP_ERROR, TYPE_SCTP_COOKIE_ECHO, TYPE_SCTP_COOKIE_ACK, TYPE_SCTP_ECN_ECNE, TYPE_SCTP_ECN_CWR, TYPE_SCTP_SHUTDOWN_COMPLETE, }; /* state_fn_t chunk_event_table[][] */ #define TYPE_SCTP_ASCONF { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_asconf), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_asconf), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_asconf), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_asconf), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_ASCONF */ #define TYPE_SCTP_ASCONF_ACK { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), 
\ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_asconf_ack), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_asconf_ack), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_asconf_ack), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_asconf_ack), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_ASCONF_ACK */ /* The primary index for this table is the chunk type. * The secondary index for this table is the state. */ static const struct sctp_sm_table_entry addip_chunk_event_table[SCTP_NUM_ADDIP_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_ASCONF, TYPE_SCTP_ASCONF_ACK, }; /*state_fn_t addip_chunk_event_table[][] */ #define TYPE_SCTP_FWD_TSN { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ootb), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_eat_fwd_tsn), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_eat_fwd_tsn), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_eat_fwd_tsn_fast), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_FWD_TSN */ /* The primary index for this table is the chunk type. * The secondary index for this table is the state. 
*/ static const struct sctp_sm_table_entry prsctp_chunk_event_table[SCTP_NUM_PRSCTP_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_FWD_TSN, }; /*state_fn_t prsctp_chunk_event_table[][] */ #define TYPE_SCTP_RECONF { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_reconf), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_reconf), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_RECONF */ /* The primary index for this table is the chunk type. * The secondary index for this table is the state. */ static const struct sctp_sm_table_entry reconf_chunk_event_table[SCTP_NUM_RECONF_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_RECONF, }; /*state_fn_t reconf_chunk_event_table[][] */ #define TYPE_SCTP_AUTH { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ootb), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_eat_auth), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_eat_auth), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_eat_auth), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_eat_auth), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_eat_auth), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_eat_auth), \ } /* TYPE_SCTP_AUTH */ /* The primary index for this table is the chunk type. * The secondary index for this table is the state. 
*/ static const struct sctp_sm_table_entry auth_chunk_event_table[SCTP_NUM_AUTH_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_AUTH, }; /*state_fn_t auth_chunk_event_table[][] */ static const struct sctp_sm_table_entry pad_chunk_event_table[SCTP_STATE_NUM_STATES] = { /* SCTP_STATE_CLOSED */ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), /* SCTP_STATE_COOKIE_WAIT */ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), /* SCTP_STATE_COOKIE_ECHOED */ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), /* SCTP_STATE_ESTABLISHED */ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), /* SCTP_STATE_SHUTDOWN_PENDING */ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), /* SCTP_STATE_SHUTDOWN_SENT */ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), /* SCTP_STATE_SHUTDOWN_RECEIVED */ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), /* SCTP_STATE_SHUTDOWN_ACK_SENT */ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), }; /* chunk pad */ static const struct sctp_sm_table_entry chunk_event_table_unknown[SCTP_STATE_NUM_STATES] = { /* SCTP_STATE_CLOSED */ TYPE_SCTP_FUNC(sctp_sf_ootb), /* SCTP_STATE_COOKIE_WAIT */ TYPE_SCTP_FUNC(sctp_sf_unk_chunk), /* SCTP_STATE_COOKIE_ECHOED */ TYPE_SCTP_FUNC(sctp_sf_unk_chunk), /* SCTP_STATE_ESTABLISHED */ TYPE_SCTP_FUNC(sctp_sf_unk_chunk), /* SCTP_STATE_SHUTDOWN_PENDING */ TYPE_SCTP_FUNC(sctp_sf_unk_chunk), /* SCTP_STATE_SHUTDOWN_SENT */ TYPE_SCTP_FUNC(sctp_sf_unk_chunk), /* SCTP_STATE_SHUTDOWN_RECEIVED */ TYPE_SCTP_FUNC(sctp_sf_unk_chunk), /* SCTP_STATE_SHUTDOWN_ACK_SENT */ TYPE_SCTP_FUNC(sctp_sf_unk_chunk), }; /* chunk unknown */ #define TYPE_SCTP_PRIMITIVE_ASSOCIATE { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_asoc), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_not_impl), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_not_impl), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_not_impl), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_not_impl), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_not_impl), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ 
TYPE_SCTP_FUNC(sctp_sf_not_impl), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_not_impl), \ } /* TYPE_SCTP_PRIMITIVE_ASSOCIATE */ #define TYPE_SCTP_PRIMITIVE_SHUTDOWN { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_error_closed), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_cookie_wait_prm_shutdown), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_cookie_echoed_prm_shutdown),\ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_2_prm_shutdown), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_primitive), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_primitive), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_primitive), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_primitive), \ } /* TYPE_SCTP_PRIMITIVE_SHUTDOWN */ #define TYPE_SCTP_PRIMITIVE_ABORT { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_error_closed), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_cookie_wait_prm_abort), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_cookie_echoed_prm_abort), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_1_prm_abort), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_shutdown_pending_prm_abort), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_shutdown_sent_prm_abort), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_1_prm_abort), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_shutdown_ack_sent_prm_abort), \ } /* TYPE_SCTP_PRIMITIVE_ABORT */ #define TYPE_SCTP_PRIMITIVE_SEND { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_error_closed), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_send), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_send), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_send), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \ /* 
SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \ } /* TYPE_SCTP_PRIMITIVE_SEND */ #define TYPE_SCTP_PRIMITIVE_REQUESTHEARTBEAT { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_error_closed), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \ } /* TYPE_SCTP_PRIMITIVE_REQUESTHEARTBEAT */ #define TYPE_SCTP_PRIMITIVE_ASCONF { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_error_closed), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_error_closed), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_error_closed), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_asconf), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_asconf), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_asconf), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_asconf), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \ } /* TYPE_SCTP_PRIMITIVE_ASCONF */ #define TYPE_SCTP_PRIMITIVE_RECONF { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_error_closed), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_error_closed), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_error_closed), \ /* SCTP_STATE_ESTABLISHED */ \ 
TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \ } /* TYPE_SCTP_PRIMITIVE_RECONF */ /* The primary index for this table is the primitive type. * The secondary index for this table is the state. */ static const struct sctp_sm_table_entry primitive_event_table[SCTP_NUM_PRIMITIVE_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_PRIMITIVE_ASSOCIATE, TYPE_SCTP_PRIMITIVE_SHUTDOWN, TYPE_SCTP_PRIMITIVE_ABORT, TYPE_SCTP_PRIMITIVE_SEND, TYPE_SCTP_PRIMITIVE_REQUESTHEARTBEAT, TYPE_SCTP_PRIMITIVE_ASCONF, TYPE_SCTP_PRIMITIVE_RECONF, }; #define TYPE_SCTP_OTHER_NO_PENDING_TSN { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_no_pending_tsn), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_2_start_shutdown), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_2_shutdown_ack), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ } #define TYPE_SCTP_OTHER_ICMP_PROTO_UNREACH { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_cookie_wait_icmp_abort), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ 
TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ } static const struct sctp_sm_table_entry other_event_table[SCTP_NUM_OTHER_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_OTHER_NO_PENDING_TSN, TYPE_SCTP_OTHER_ICMP_PROTO_UNREACH, }; #define TYPE_SCTP_EVENT_TIMEOUT_NONE { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_bug), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_bug), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_bug), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_bug), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_bug), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_bug), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_bug), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_bug), \ } #define TYPE_SCTP_EVENT_TIMEOUT_T1_COOKIE { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_bug), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_t1_cookie_timer_expire), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } #define TYPE_SCTP_EVENT_TIMEOUT_T1_INIT { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_t1_init_timer_expire), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_RECEIVED 
*/ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } #define TYPE_SCTP_EVENT_TIMEOUT_T2_SHUTDOWN { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_t2_timer_expire), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_t2_timer_expire), \ } #define TYPE_SCTP_EVENT_TIMEOUT_T3_RTX { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_do_6_3_3_rtx), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_6_3_3_rtx), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_6_3_3_rtx), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_6_3_3_rtx), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } #define TYPE_SCTP_EVENT_TIMEOUT_T4_RTO { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_t4_timer_expire), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ 
TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } #define TYPE_SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_t5_timer_expire), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_t5_timer_expire), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } #define TYPE_SCTP_EVENT_TIMEOUT_HEARTBEAT { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_sendbeat_8_3), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_sendbeat_8_3), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_sendbeat_8_3), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } #define TYPE_SCTP_EVENT_TIMEOUT_SACK { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_6_2_sack), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_6_2_sack), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_6_2_sack), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } #define TYPE_SCTP_EVENT_TIMEOUT_AUTOCLOSE { \ /* 
SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_autoclose_timer_expire), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } #define TYPE_SCTP_EVENT_TIMEOUT_RECONF { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_send_reconf), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } #define TYPE_SCTP_EVENT_TIMEOUT_PROBE { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_send_probe), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } static const struct sctp_sm_table_entry timeout_event_table[SCTP_NUM_TIMEOUT_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_EVENT_TIMEOUT_NONE, 
TYPE_SCTP_EVENT_TIMEOUT_T1_COOKIE, TYPE_SCTP_EVENT_TIMEOUT_T1_INIT, TYPE_SCTP_EVENT_TIMEOUT_T2_SHUTDOWN, TYPE_SCTP_EVENT_TIMEOUT_T3_RTX, TYPE_SCTP_EVENT_TIMEOUT_T4_RTO, TYPE_SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD, TYPE_SCTP_EVENT_TIMEOUT_HEARTBEAT, TYPE_SCTP_EVENT_TIMEOUT_RECONF, TYPE_SCTP_EVENT_TIMEOUT_PROBE, TYPE_SCTP_EVENT_TIMEOUT_SACK, TYPE_SCTP_EVENT_TIMEOUT_AUTOCLOSE, }; static const struct sctp_sm_table_entry *sctp_chunk_event_lookup( struct net *net, enum sctp_cid cid, enum sctp_state state) { if (state > SCTP_STATE_MAX) return &bug; if (cid == SCTP_CID_I_DATA) cid = SCTP_CID_DATA; if (cid <= SCTP_CID_BASE_MAX) return &chunk_event_table[cid][state]; switch ((u16)cid) { case SCTP_CID_FWD_TSN: case SCTP_CID_I_FWD_TSN: return &prsctp_chunk_event_table[0][state]; case SCTP_CID_ASCONF: return &addip_chunk_event_table[0][state]; case SCTP_CID_ASCONF_ACK: return &addip_chunk_event_table[1][state]; case SCTP_CID_RECONF: return &reconf_chunk_event_table[0][state]; case SCTP_CID_AUTH: return &auth_chunk_event_table[0][state]; case SCTP_CID_PAD: return &pad_chunk_event_table[state]; } return &chunk_event_table_unknown[state]; }
4 12 4 3 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * NetLabel Network Address Lists * * This file contains network address list functions used to manage ordered * lists of network addresses for use by the NetLabel subsystem. The NetLabel * system manages static and dynamic label mappings for network protocols such * as CIPSO and RIPSO. 
* * Author: Paul Moore <paul@paul-moore.com> */ /* * (c) Copyright Hewlett-Packard Development Company, L.P., 2008 */ #ifndef _NETLABEL_ADDRLIST_H #define _NETLABEL_ADDRLIST_H #include <linux/types.h> #include <linux/rcupdate.h> #include <linux/list.h> #include <linux/in6.h> #include <linux/audit.h> /** * struct netlbl_af4list - NetLabel IPv4 address list * @addr: IPv4 address * @mask: IPv4 address mask * @valid: valid flag * @list: list structure, used internally */ struct netlbl_af4list { __be32 addr; __be32 mask; u32 valid; struct list_head list; }; /** * struct netlbl_af6list - NetLabel IPv6 address list * @addr: IPv6 address * @mask: IPv6 address mask * @valid: valid flag * @list: list structure, used internally */ struct netlbl_af6list { struct in6_addr addr; struct in6_addr mask; u32 valid; struct list_head list; }; #define __af4list_entry(ptr) container_of(ptr, struct netlbl_af4list, list) static inline struct netlbl_af4list *__af4list_valid(struct list_head *s, struct list_head *h) { struct list_head *i = s; struct netlbl_af4list *n = __af4list_entry(s); while (i != h && !n->valid) { i = i->next; n = __af4list_entry(i); } return n; } static inline struct netlbl_af4list *__af4list_valid_rcu(struct list_head *s, struct list_head *h) { struct list_head *i = s; struct netlbl_af4list *n = __af4list_entry(s); while (i != h && !n->valid) { i = rcu_dereference(list_next_rcu(i)); n = __af4list_entry(i); } return n; } #define netlbl_af4list_foreach(iter, head) \ for (iter = __af4list_valid((head)->next, head); \ &iter->list != (head); \ iter = __af4list_valid(iter->list.next, head)) #define netlbl_af4list_foreach_rcu(iter, head) \ for (iter = __af4list_valid_rcu((head)->next, head); \ &iter->list != (head); \ iter = __af4list_valid_rcu(iter->list.next, head)) #define netlbl_af4list_foreach_safe(iter, tmp, head) \ for (iter = __af4list_valid((head)->next, head), \ tmp = __af4list_valid(iter->list.next, head); \ &iter->list != (head); \ iter = tmp, tmp = 
__af4list_valid(iter->list.next, head)) int netlbl_af4list_add(struct netlbl_af4list *entry, struct list_head *head); struct netlbl_af4list *netlbl_af4list_remove(__be32 addr, __be32 mask, struct list_head *head); void netlbl_af4list_remove_entry(struct netlbl_af4list *entry); struct netlbl_af4list *netlbl_af4list_search(__be32 addr, struct list_head *head); struct netlbl_af4list *netlbl_af4list_search_exact(__be32 addr, __be32 mask, struct list_head *head); #ifdef CONFIG_AUDIT void netlbl_af4list_audit_addr(struct audit_buffer *audit_buf, int src, const char *dev, __be32 addr, __be32 mask); #else static inline void netlbl_af4list_audit_addr(struct audit_buffer *audit_buf, int src, const char *dev, __be32 addr, __be32 mask) { } #endif #if IS_ENABLED(CONFIG_IPV6) #define __af6list_entry(ptr) container_of(ptr, struct netlbl_af6list, list) static inline struct netlbl_af6list *__af6list_valid(struct list_head *s, struct list_head *h) { struct list_head *i = s; struct netlbl_af6list *n = __af6list_entry(s); while (i != h && !n->valid) { i = i->next; n = __af6list_entry(i); } return n; } static inline struct netlbl_af6list *__af6list_valid_rcu(struct list_head *s, struct list_head *h) { struct list_head *i = s; struct netlbl_af6list *n = __af6list_entry(s); while (i != h && !n->valid) { i = rcu_dereference(list_next_rcu(i)); n = __af6list_entry(i); } return n; } #define netlbl_af6list_foreach(iter, head) \ for (iter = __af6list_valid((head)->next, head); \ &iter->list != (head); \ iter = __af6list_valid(iter->list.next, head)) #define netlbl_af6list_foreach_rcu(iter, head) \ for (iter = __af6list_valid_rcu((head)->next, head); \ &iter->list != (head); \ iter = __af6list_valid_rcu(iter->list.next, head)) #define netlbl_af6list_foreach_safe(iter, tmp, head) \ for (iter = __af6list_valid((head)->next, head), \ tmp = __af6list_valid(iter->list.next, head); \ &iter->list != (head); \ iter = tmp, tmp = __af6list_valid(iter->list.next, head)) int netlbl_af6list_add(struct 
netlbl_af6list *entry, struct list_head *head); struct netlbl_af6list *netlbl_af6list_remove(const struct in6_addr *addr, const struct in6_addr *mask, struct list_head *head); void netlbl_af6list_remove_entry(struct netlbl_af6list *entry); struct netlbl_af6list *netlbl_af6list_search(const struct in6_addr *addr, struct list_head *head); struct netlbl_af6list *netlbl_af6list_search_exact(const struct in6_addr *addr, const struct in6_addr *mask, struct list_head *head); #ifdef CONFIG_AUDIT void netlbl_af6list_audit_addr(struct audit_buffer *audit_buf, int src, const char *dev, const struct in6_addr *addr, const struct in6_addr *mask); #else static inline void netlbl_af6list_audit_addr(struct audit_buffer *audit_buf, int src, const char *dev, const struct in6_addr *addr, const struct in6_addr *mask) { } #endif #endif /* IPV6 */ #endif
261 257 255 167 165 3 164 4 167 167 1 167 9 158 60 32 50 41 43 10 34 16 16 3 1 3 248 268 268 254 3 10 3 128 166 255 266 9 256 139 167 134 140 273 214 165 57 54 3 27 28 29 29 33 31 31 57 56 54 52 5 11 10 38 8 8 8 279 17 266 17 278 267 17 2 43 3 56 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 
466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001-2003 International Business Machines Corp. * Copyright (c) 2001 Intel Corp. * Copyright (c) 2001 La Monte H.P. 
Yarroll * * This file is part of the SCTP kernel implementation * * This module provides the abstraction for an SCTP transport representing * a remote transport address. For local transport addresses, we just use * union sctp_addr. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * Jon Grimm <jgrimm@us.ibm.com> * Xingang Guo <xingang.guo@intel.com> * Hui Huang <hui.huang@nokia.com> * Sridhar Samudrala <sri@us.ibm.com> * Ardelle Fan <ardelle.fan@intel.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/slab.h> #include <linux/types.h> #include <linux/random.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> /* 1st Level Abstractions. */ /* Initialize a new transport from provided memory. */ static struct sctp_transport *sctp_transport_init(struct net *net, struct sctp_transport *peer, const union sctp_addr *addr, gfp_t gfp) { /* Copy in the address. */ peer->af_specific = sctp_get_af_specific(addr->sa.sa_family); memcpy(&peer->ipaddr, addr, peer->af_specific->sockaddr_len); memset(&peer->saddr, 0, sizeof(union sctp_addr)); peer->sack_generation = 0; /* From 6.3.1 RTO Calculation: * * C1) Until an RTT measurement has been made for a packet sent to the * given destination transport address, set RTO to the protocol * parameter 'RTO.Initial'. */ peer->rto = msecs_to_jiffies(net->sctp.rto_initial); peer->last_time_heard = 0; peer->last_time_ecne_reduced = jiffies; peer->param_flags = SPP_HB_DISABLE | SPP_PMTUD_ENABLE | SPP_SACKDELAY_ENABLE; /* Initialize the default path max_retrans. 
*/ peer->pathmaxrxt = net->sctp.max_retrans_path; peer->pf_retrans = net->sctp.pf_retrans; INIT_LIST_HEAD(&peer->transmitted); INIT_LIST_HEAD(&peer->send_ready); INIT_LIST_HEAD(&peer->transports); timer_setup(&peer->T3_rtx_timer, sctp_generate_t3_rtx_event, 0); timer_setup(&peer->hb_timer, sctp_generate_heartbeat_event, 0); timer_setup(&peer->reconf_timer, sctp_generate_reconf_event, 0); timer_setup(&peer->probe_timer, sctp_generate_probe_event, 0); timer_setup(&peer->proto_unreach_timer, sctp_generate_proto_unreach_event, 0); /* Initialize the 64-bit random nonce sent with heartbeat. */ get_random_bytes(&peer->hb_nonce, sizeof(peer->hb_nonce)); refcount_set(&peer->refcnt, 1); return peer; } /* Allocate and initialize a new transport. */ struct sctp_transport *sctp_transport_new(struct net *net, const union sctp_addr *addr, gfp_t gfp) { struct sctp_transport *transport; transport = kzalloc(sizeof(*transport), gfp); if (!transport) goto fail; if (!sctp_transport_init(net, transport, addr, gfp)) goto fail_init; SCTP_DBG_OBJCNT_INC(transport); return transport; fail_init: kfree(transport); fail: return NULL; } /* This transport is no longer needed. Free up if possible, or * delay until it last reference count. */ void sctp_transport_free(struct sctp_transport *transport) { /* Try to delete the heartbeat timer. */ if (del_timer(&transport->hb_timer)) sctp_transport_put(transport); /* Delete the T3_rtx timer if it's active. * There is no point in not doing this now and letting * structure hang around in memory since we know * the transport is going away. */ if (del_timer(&transport->T3_rtx_timer)) sctp_transport_put(transport); if (del_timer(&transport->reconf_timer)) sctp_transport_put(transport); if (del_timer(&transport->probe_timer)) sctp_transport_put(transport); /* Delete the ICMP proto unreachable timer if it's active. 
*/ if (del_timer(&transport->proto_unreach_timer)) sctp_transport_put(transport); sctp_transport_put(transport); } static void sctp_transport_destroy_rcu(struct rcu_head *head) { struct sctp_transport *transport; transport = container_of(head, struct sctp_transport, rcu); dst_release(transport->dst); kfree(transport); SCTP_DBG_OBJCNT_DEC(transport); } /* Destroy the transport data structure. * Assumes there are no more users of this structure. */ static void sctp_transport_destroy(struct sctp_transport *transport) { if (unlikely(refcount_read(&transport->refcnt))) { WARN(1, "Attempt to destroy undead transport %p!\n", transport); return; } sctp_packet_free(&transport->packet); if (transport->asoc) sctp_association_put(transport->asoc); call_rcu(&transport->rcu, sctp_transport_destroy_rcu); } /* Start T3_rtx timer if it is not already running and update the heartbeat * timer. This routine is called every time a DATA chunk is sent. */ void sctp_transport_reset_t3_rtx(struct sctp_transport *transport) { /* RFC 2960 6.3.2 Retransmission Timer Rules * * R1) Every time a DATA chunk is sent to any address(including a * retransmission), if the T3-rtx timer of that address is not running * start it running so that it will expire after the RTO of that * address. */ if (!timer_pending(&transport->T3_rtx_timer)) if (!mod_timer(&transport->T3_rtx_timer, jiffies + transport->rto)) sctp_transport_hold(transport); } void sctp_transport_reset_hb_timer(struct sctp_transport *transport) { unsigned long expires; /* When a data chunk is sent, reset the heartbeat interval. 
*/ expires = jiffies + sctp_transport_timeout(transport); if (!mod_timer(&transport->hb_timer, expires + get_random_u32_below(transport->rto))) sctp_transport_hold(transport); } void sctp_transport_reset_reconf_timer(struct sctp_transport *transport) { if (!timer_pending(&transport->reconf_timer)) if (!mod_timer(&transport->reconf_timer, jiffies + transport->rto)) sctp_transport_hold(transport); } void sctp_transport_reset_probe_timer(struct sctp_transport *transport) { if (!mod_timer(&transport->probe_timer, jiffies + transport->probe_interval)) sctp_transport_hold(transport); } void sctp_transport_reset_raise_timer(struct sctp_transport *transport) { if (!mod_timer(&transport->probe_timer, jiffies + transport->probe_interval * 30)) sctp_transport_hold(transport); } /* This transport has been assigned to an association. * Initialize fields from the association or from the sock itself. * Register the reference count in the association. */ void sctp_transport_set_owner(struct sctp_transport *transport, struct sctp_association *asoc) { transport->asoc = asoc; sctp_association_hold(asoc); } /* Initialize the pmtu of a transport. 
*/ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk) { /* If we don't have a fresh route, look one up */ if (!transport->dst || transport->dst->obsolete) { sctp_transport_dst_release(transport); transport->af_specific->get_dst(transport, &transport->saddr, &transport->fl, sk); } if (transport->param_flags & SPP_PMTUD_DISABLE) { struct sctp_association *asoc = transport->asoc; if (!transport->pathmtu && asoc && asoc->pathmtu) transport->pathmtu = asoc->pathmtu; if (transport->pathmtu) return; } if (transport->dst) transport->pathmtu = sctp_dst_mtu(transport->dst); else transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT; sctp_transport_pl_update(transport); } void sctp_transport_pl_send(struct sctp_transport *t) { if (t->pl.probe_count < SCTP_MAX_PROBES) goto out; t->pl.probe_count = 0; if (t->pl.state == SCTP_PL_BASE) { if (t->pl.probe_size == SCTP_BASE_PLPMTU) { /* BASE_PLPMTU Confirmation Failed */ t->pl.state = SCTP_PL_ERROR; /* Base -> Error */ t->pl.pmtu = SCTP_BASE_PLPMTU; t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); sctp_assoc_sync_pmtu(t->asoc); } } else if (t->pl.state == SCTP_PL_SEARCH) { if (t->pl.pmtu == t->pl.probe_size) { /* Black Hole Detected */ t->pl.state = SCTP_PL_BASE; /* Search -> Base */ t->pl.probe_size = SCTP_BASE_PLPMTU; t->pl.probe_high = 0; t->pl.pmtu = SCTP_BASE_PLPMTU; t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); sctp_assoc_sync_pmtu(t->asoc); } else { /* Normal probe failure. 
*/ t->pl.probe_high = t->pl.probe_size; t->pl.probe_size = t->pl.pmtu; } } else if (t->pl.state == SCTP_PL_COMPLETE) { if (t->pl.pmtu == t->pl.probe_size) { /* Black Hole Detected */ t->pl.state = SCTP_PL_BASE; /* Search Complete -> Base */ t->pl.probe_size = SCTP_BASE_PLPMTU; t->pl.pmtu = SCTP_BASE_PLPMTU; t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); sctp_assoc_sync_pmtu(t->asoc); } } out: pr_debug("%s: PLPMTUD: transport: %p, state: %d, pmtu: %d, size: %d, high: %d\n", __func__, t, t->pl.state, t->pl.pmtu, t->pl.probe_size, t->pl.probe_high); t->pl.probe_count++; } bool sctp_transport_pl_recv(struct sctp_transport *t) { pr_debug("%s: PLPMTUD: transport: %p, state: %d, pmtu: %d, size: %d, high: %d\n", __func__, t, t->pl.state, t->pl.pmtu, t->pl.probe_size, t->pl.probe_high); t->pl.pmtu = t->pl.probe_size; t->pl.probe_count = 0; if (t->pl.state == SCTP_PL_BASE) { t->pl.state = SCTP_PL_SEARCH; /* Base -> Search */ t->pl.probe_size += SCTP_PL_BIG_STEP; } else if (t->pl.state == SCTP_PL_ERROR) { t->pl.state = SCTP_PL_SEARCH; /* Error -> Search */ t->pl.pmtu = t->pl.probe_size; t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); sctp_assoc_sync_pmtu(t->asoc); t->pl.probe_size += SCTP_PL_BIG_STEP; } else if (t->pl.state == SCTP_PL_SEARCH) { if (!t->pl.probe_high) { if (t->pl.probe_size < SCTP_MAX_PLPMTU) { t->pl.probe_size = min(t->pl.probe_size + SCTP_PL_BIG_STEP, SCTP_MAX_PLPMTU); return false; } t->pl.probe_high = SCTP_MAX_PLPMTU; } t->pl.probe_size += SCTP_PL_MIN_STEP; if (t->pl.probe_size >= t->pl.probe_high) { t->pl.probe_high = 0; t->pl.state = SCTP_PL_COMPLETE; /* Search -> Search Complete */ t->pl.probe_size = t->pl.pmtu; t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); sctp_assoc_sync_pmtu(t->asoc); sctp_transport_reset_raise_timer(t); } } else if (t->pl.state == SCTP_PL_COMPLETE) { /* Raise probe_size again after 30 * interval in Search Complete */ t->pl.state = SCTP_PL_SEARCH; /* Search Complete -> Search */ t->pl.probe_size = 
min(t->pl.probe_size + SCTP_PL_MIN_STEP, SCTP_MAX_PLPMTU); } return t->pl.state == SCTP_PL_COMPLETE; } static bool sctp_transport_pl_toobig(struct sctp_transport *t, u32 pmtu) { pr_debug("%s: PLPMTUD: transport: %p, state: %d, pmtu: %d, size: %d, ptb: %d\n", __func__, t, t->pl.state, t->pl.pmtu, t->pl.probe_size, pmtu); if (pmtu < SCTP_MIN_PLPMTU || pmtu >= t->pl.probe_size) return false; if (t->pl.state == SCTP_PL_BASE) { if (pmtu >= SCTP_MIN_PLPMTU && pmtu < SCTP_BASE_PLPMTU) { t->pl.state = SCTP_PL_ERROR; /* Base -> Error */ t->pl.pmtu = SCTP_BASE_PLPMTU; t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); return true; } } else if (t->pl.state == SCTP_PL_SEARCH) { if (pmtu >= SCTP_BASE_PLPMTU && pmtu < t->pl.pmtu) { t->pl.state = SCTP_PL_BASE; /* Search -> Base */ t->pl.probe_size = SCTP_BASE_PLPMTU; t->pl.probe_count = 0; t->pl.probe_high = 0; t->pl.pmtu = SCTP_BASE_PLPMTU; t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); return true; } else if (pmtu > t->pl.pmtu && pmtu < t->pl.probe_size) { t->pl.probe_size = pmtu; t->pl.probe_count = 0; } } else if (t->pl.state == SCTP_PL_COMPLETE) { if (pmtu >= SCTP_BASE_PLPMTU && pmtu < t->pl.pmtu) { t->pl.state = SCTP_PL_BASE; /* Complete -> Base */ t->pl.probe_size = SCTP_BASE_PLPMTU; t->pl.probe_count = 0; t->pl.probe_high = 0; t->pl.pmtu = SCTP_BASE_PLPMTU; t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); sctp_transport_reset_probe_timer(t); return true; } } return false; } bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu) { struct sock *sk = t->asoc->base.sk; struct dst_entry *dst; bool change = true; if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) { pr_warn_ratelimited("%s: Reported pmtu %d too low, using default minimum of %d\n", __func__, pmtu, SCTP_DEFAULT_MINSEGMENT); /* Use default minimum segment instead */ pmtu = SCTP_DEFAULT_MINSEGMENT; } pmtu = SCTP_TRUNC4(pmtu); if (sctp_transport_pl_enabled(t)) return sctp_transport_pl_toobig(t, pmtu - sctp_transport_pl_hlen(t)); dst = 
sctp_transport_dst_check(t); if (dst) { struct sctp_pf *pf = sctp_get_pf_specific(dst->ops->family); union sctp_addr addr; pf->af->from_sk(&addr, sk); pf->to_sk_daddr(&t->ipaddr, sk); dst->ops->update_pmtu(dst, sk, NULL, pmtu, true); pf->to_sk_daddr(&addr, sk); dst = sctp_transport_dst_check(t); } if (!dst) { t->af_specific->get_dst(t, &t->saddr, &t->fl, sk); dst = t->dst; } if (dst) { /* Re-fetch, as under layers may have a higher minimum size */ pmtu = sctp_dst_mtu(dst); change = t->pathmtu != pmtu; } t->pathmtu = pmtu; return change; } /* Caches the dst entry and source address for a transport's destination * address. */ void sctp_transport_route(struct sctp_transport *transport, union sctp_addr *saddr, struct sctp_sock *opt) { struct sctp_association *asoc = transport->asoc; struct sctp_af *af = transport->af_specific; sctp_transport_dst_release(transport); af->get_dst(transport, saddr, &transport->fl, sctp_opt2sk(opt)); if (saddr) memcpy(&transport->saddr, saddr, sizeof(union sctp_addr)); else af->get_saddr(opt, transport, &transport->fl); sctp_transport_pmtu(transport, sctp_opt2sk(opt)); /* Initialize sk->sk_rcv_saddr, if the transport is the * association's active path for getsockname(). */ if (transport->dst && asoc && (!asoc->peer.primary_path || transport == asoc->peer.active_path)) opt->pf->to_sk_saddr(&transport->saddr, asoc->base.sk); } /* Hold a reference to a transport. */ int sctp_transport_hold(struct sctp_transport *transport) { return refcount_inc_not_zero(&transport->refcnt); } /* Release a reference to a transport and clean up * if there are no more references. */ void sctp_transport_put(struct sctp_transport *transport) { if (refcount_dec_and_test(&transport->refcnt)) sctp_transport_destroy(transport); } /* Update transport's RTO based on the newly calculated RTT. */ void sctp_transport_update_rto(struct sctp_transport *tp, __u32 rtt) { if (unlikely(!tp->rto_pending)) /* We should not be doing any RTO updates unless rto_pending is set. 
*/ pr_debug("%s: rto_pending not set on transport %p!\n", __func__, tp); if (tp->rttvar || tp->srtt) { struct net *net = tp->asoc->base.net; /* 6.3.1 C3) When a new RTT measurement R' is made, set * RTTVAR <- (1 - RTO.Beta) * RTTVAR + RTO.Beta * |SRTT - R'| * SRTT <- (1 - RTO.Alpha) * SRTT + RTO.Alpha * R' */ /* Note: The above algorithm has been rewritten to * express rto_beta and rto_alpha as inverse powers * of two. * For example, assuming the default value of RTO.Alpha of * 1/8, rto_alpha would be expressed as 3. */ tp->rttvar = tp->rttvar - (tp->rttvar >> net->sctp.rto_beta) + (((__u32)abs((__s64)tp->srtt - (__s64)rtt)) >> net->sctp.rto_beta); tp->srtt = tp->srtt - (tp->srtt >> net->sctp.rto_alpha) + (rtt >> net->sctp.rto_alpha); } else { /* 6.3.1 C2) When the first RTT measurement R is made, set * SRTT <- R, RTTVAR <- R/2. */ tp->srtt = rtt; tp->rttvar = rtt >> 1; } /* 6.3.1 G1) Whenever RTTVAR is computed, if RTTVAR = 0, then * adjust RTTVAR <- G, where G is the CLOCK GRANULARITY. */ if (tp->rttvar == 0) tp->rttvar = SCTP_CLOCK_GRANULARITY; /* 6.3.1 C3) After the computation, update RTO <- SRTT + 4 * RTTVAR. */ tp->rto = tp->srtt + (tp->rttvar << 2); /* 6.3.1 C6) Whenever RTO is computed, if it is less than RTO.Min * seconds then it is rounded up to RTO.Min seconds. */ if (tp->rto < tp->asoc->rto_min) tp->rto = tp->asoc->rto_min; /* 6.3.1 C7) A maximum value may be placed on RTO provided it is * at least RTO.max seconds. */ if (tp->rto > tp->asoc->rto_max) tp->rto = tp->asoc->rto_max; sctp_max_rto(tp->asoc, tp); tp->rtt = rtt; /* Reset rto_pending so that a new RTT measurement is started when a * new data chunk is sent. */ tp->rto_pending = 0; pr_debug("%s: transport:%p, rtt:%d, srtt:%d rttvar:%d, rto:%ld\n", __func__, tp, rtt, tp->srtt, tp->rttvar, tp->rto); } /* This routine updates the transport's cwnd and partial_bytes_acked * parameters based on the bytes acked in the received SACK. 
*/
void sctp_transport_raise_cwnd(struct sctp_transport *transport,
			       __u32 sack_ctsn, __u32 bytes_acked)
{
	struct sctp_association *asoc = transport->asoc;
	__u32 cwnd = transport->cwnd;
	__u32 flight_size = transport->flight_size;
	__u32 ssthresh = transport->ssthresh;
	__u32 pba = transport->partial_bytes_acked;
	__u32 pmtu = asoc->pathmtu;

	/* Leave Fast Recovery first if this SACK reaches the exit point. */
	if (asoc->fast_recovery &&
	    TSN_lte(asoc->fast_recovery_exit, sack_ctsn))
		asoc->fast_recovery = 0;

	if (cwnd <= ssthresh) {
		/* RFC 4960 7.2.1
		 * o  When cwnd is less than or equal to ssthresh, an SCTP
		 *    endpoint MUST use the slow-start algorithm to increase
		 *    cwnd only if the current congestion window is being fully
		 *    utilized, an incoming SACK advances the Cumulative TSN
		 *    Ack Point, and the data sender is not in Fast Recovery.
		 *    Only when these three conditions are met can the cwnd be
		 *    increased; otherwise, the cwnd MUST not be increased.
		 *    If these conditions are met, then cwnd MUST be increased
		 *    by, at most, the lesser of 1) the total size of the
		 *    previously outstanding DATA chunk(s) acknowledged, and
		 *    2) the destination's path MTU.  This upper bound protects
		 *    against the ACK-Splitting attack outlined in [SAVAGE99].
		 *
		 * The increase is performed if, and only if, the congestion
		 * window is being fully utilized.  RFC 4960 Errata 3.22
		 * removed the other condition on ctsn moving.
		 */
		if (asoc->fast_recovery || flight_size < cwnd)
			return;

		cwnd += (bytes_acked > pmtu) ? pmtu : bytes_acked;

		pr_debug("%s: slow start: transport:%p, bytes_acked:%d, "
			 "cwnd:%d, ssthresh:%d, flight_size:%d, pba:%d\n",
			 __func__, transport, bytes_acked, cwnd, ssthresh,
			 flight_size, pba);
	} else {
		/* RFC 2960 7.2.2 Whenever cwnd is greater than ssthresh,
		 * upon each SACK arrival, increase partial_bytes_acked
		 * by the total number of bytes of all new chunks
		 * acknowledged in that SACK including chunks
		 * acknowledged by the new Cumulative TSN Ack and by Gap
		 * Ack Blocks. (updated by RFC4960 Errata 3.22)
		 *
		 * When partial_bytes_acked is greater than cwnd and
		 * before the arrival of the SACK the sender had less
		 * bytes of data outstanding than cwnd (i.e., before
		 * arrival of the SACK, flightsize was less than cwnd),
		 * reset partial_bytes_acked to cwnd. (RFC 4960 Errata
		 * 3.26)
		 *
		 * When partial_bytes_acked is equal to or greater than
		 * cwnd and before the arrival of the SACK the sender
		 * had cwnd or more bytes of data outstanding (i.e.,
		 * before arrival of the SACK, flightsize was greater
		 * than or equal to cwnd), partial_bytes_acked is reset
		 * to (partial_bytes_acked - cwnd). Next, cwnd is
		 * increased by MTU. (RFC 4960 Errata 3.12)
		 */
		pba += bytes_acked;

		if (flight_size < cwnd) {
			/* Window was not fully used: cap pba at cwnd. */
			if (pba > cwnd)
				pba = cwnd;
		} else if (pba >= cwnd) {
			/* Window was fully used and a full cwnd of data has
			 * been acknowledged: grow by one MTU.
			 */
			pba -= cwnd;
			cwnd += pmtu;
		}

		pr_debug("%s: congestion avoidance: transport:%p, "
			 "bytes_acked:%d, cwnd:%d, ssthresh:%d, "
			 "flight_size:%d, pba:%d\n", __func__,
			 transport, bytes_acked, cwnd, ssthresh,
			 flight_size, pba);
	}

	transport->cwnd = cwnd;
	transport->partial_bytes_acked = pba;
}

/* This routine is used to lower the transport's cwnd when congestion is
 * detected.
*/ void sctp_transport_lower_cwnd(struct sctp_transport *transport, enum sctp_lower_cwnd reason) { struct sctp_association *asoc = transport->asoc; switch (reason) { case SCTP_LOWER_CWND_T3_RTX: /* RFC 2960 Section 7.2.3, sctpimpguide * When the T3-rtx timer expires on an address, SCTP should * perform slow start by: * ssthresh = max(cwnd/2, 4*MTU) * cwnd = 1*MTU * partial_bytes_acked = 0 */ transport->ssthresh = max(transport->cwnd/2, 4*asoc->pathmtu); transport->cwnd = asoc->pathmtu; /* T3-rtx also clears fast recovery */ asoc->fast_recovery = 0; break; case SCTP_LOWER_CWND_FAST_RTX: /* RFC 2960 7.2.4 Adjust the ssthresh and cwnd of the * destination address(es) to which the missing DATA chunks * were last sent, according to the formula described in * Section 7.2.3. * * RFC 2960 7.2.3, sctpimpguide Upon detection of packet * losses from SACK (see Section 7.2.4), An endpoint * should do the following: * ssthresh = max(cwnd/2, 4*MTU) * cwnd = ssthresh * partial_bytes_acked = 0 */ if (asoc->fast_recovery) return; /* Mark Fast recovery */ asoc->fast_recovery = 1; asoc->fast_recovery_exit = asoc->next_tsn - 1; transport->ssthresh = max(transport->cwnd/2, 4*asoc->pathmtu); transport->cwnd = transport->ssthresh; break; case SCTP_LOWER_CWND_ECNE: /* RFC 2481 Section 6.1.2. * If the sender receives an ECN-Echo ACK packet * then the sender knows that congestion was encountered in the * network on the path from the sender to the receiver. The * indication of congestion should be treated just as a * congestion loss in non-ECN Capable TCP. That is, the TCP * source halves the congestion window "cwnd" and reduces the * slow start threshold "ssthresh". * A critical condition is that TCP does not react to * congestion indications more than once every window of * data (or more loosely more than once every round-trip time). 
*/ if (time_after(jiffies, transport->last_time_ecne_reduced + transport->rtt)) { transport->ssthresh = max(transport->cwnd/2, 4*asoc->pathmtu); transport->cwnd = transport->ssthresh; transport->last_time_ecne_reduced = jiffies; } break; case SCTP_LOWER_CWND_INACTIVE: /* RFC 2960 Section 7.2.1, sctpimpguide * When the endpoint does not transmit data on a given * transport address, the cwnd of the transport address * should be adjusted to max(cwnd/2, 4*MTU) per RTO. * NOTE: Although the draft recommends that this check needs * to be done every RTO interval, we do it every hearbeat * interval. */ transport->cwnd = max(transport->cwnd/2, 4*asoc->pathmtu); /* RFC 4960 Errata 3.27.2: also adjust sshthresh */ transport->ssthresh = transport->cwnd; break; } transport->partial_bytes_acked = 0; pr_debug("%s: transport:%p, reason:%d, cwnd:%d, ssthresh:%d\n", __func__, transport, reason, transport->cwnd, transport->ssthresh); } /* Apply Max.Burst limit to the congestion window: * sctpimpguide-05 2.14.2 * D) When the time comes for the sender to * transmit new DATA chunks, the protocol parameter Max.Burst MUST * first be applied to limit how many new DATA chunks may be sent. * The limit is applied by adjusting cwnd as follows: * if ((flightsize+ Max.Burst * MTU) < cwnd) * cwnd = flightsize + Max.Burst * MTU */ void sctp_transport_burst_limited(struct sctp_transport *t) { struct sctp_association *asoc = t->asoc; u32 old_cwnd = t->cwnd; u32 max_burst_bytes; if (t->burst_limited || asoc->max_burst == 0) return; max_burst_bytes = t->flight_size + (asoc->max_burst * asoc->pathmtu); if (max_burst_bytes < old_cwnd) { t->cwnd = max_burst_bytes; t->burst_limited = old_cwnd; } } /* Restore the old cwnd congestion window, after the burst had it's * desired effect. */ void sctp_transport_burst_reset(struct sctp_transport *t) { if (t->burst_limited) { t->cwnd = t->burst_limited; t->burst_limited = 0; } } /* What is the next timeout value for this transport? 
*/ unsigned long sctp_transport_timeout(struct sctp_transport *trans) { /* RTO + timer slack +/- 50% of RTO */ unsigned long timeout = trans->rto >> 1; if (trans->state != SCTP_UNCONFIRMED && trans->state != SCTP_PF) timeout += trans->hbinterval; return max_t(unsigned long, timeout, HZ / 5); } /* Reset transport variables to their initial values */ void sctp_transport_reset(struct sctp_transport *t) { struct sctp_association *asoc = t->asoc; /* RFC 2960 (bis), Section 5.2.4 * All the congestion control parameters (e.g., cwnd, ssthresh) * related to this peer MUST be reset to their initial values * (see Section 6.2.1) */ t->cwnd = min(4*asoc->pathmtu, max_t(__u32, 2*asoc->pathmtu, 4380)); t->burst_limited = 0; t->ssthresh = asoc->peer.i.a_rwnd; t->rto = asoc->rto_initial; sctp_max_rto(asoc, t); t->rtt = 0; t->srtt = 0; t->rttvar = 0; /* Reset these additional variables so that we have a clean slate. */ t->partial_bytes_acked = 0; t->flight_size = 0; t->error_count = 0; t->rto_pending = 0; t->hb_sent = 0; /* Initialize the state information for SFR-CACC */ t->cacc.changeover_active = 0; t->cacc.cycling_changeover = 0; t->cacc.next_tsn_at_change = 0; t->cacc.cacc_saw_newack = 0; } /* Schedule retransmission on the given transport */ void sctp_transport_immediate_rtx(struct sctp_transport *t) { /* Stop pending T3_rtx_timer */ if (del_timer(&t->T3_rtx_timer)) sctp_transport_put(t); sctp_retransmit(&t->asoc->outqueue, t, SCTP_RTXR_T3_RTX); if (!timer_pending(&t->T3_rtx_timer)) { if (!mod_timer(&t->T3_rtx_timer, jiffies + t->rto)) sctp_transport_hold(t); } } /* Drop dst */ void sctp_transport_dst_release(struct sctp_transport *t) { dst_release(t->dst); t->dst = NULL; t->dst_pending_confirm = 0; } /* Schedule neighbour confirm */ void sctp_transport_dst_confirm(struct sctp_transport *t) { t->dst_pending_confirm = 1; }
67 31 36 66 2 12 52 3 51 51 16 2 2 2 4 4 4 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 // SPDX-License-Identifier: GPL-2.0-only #include <linux/module.h> #include <linux/errno.h> #include <linux/socket.h> #include <linux/udp.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/in6.h> #include <net/udp.h> #include <net/udp_tunnel.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/ip6_tunnel.h> #include <net/ip6_checksum.h> int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, struct socket **sockp) { struct sockaddr_in6 udp6_addr = {}; int err; struct socket *sock = NULL; err = sock_create_kern(net, AF_INET6, SOCK_DGRAM, 0, &sock); if (err < 0) goto error; if (cfg->ipv6_v6only) { err = ip6_sock_set_v6only(sock->sk); if (err < 0) goto error; } if (cfg->bind_ifindex) { err = sock_bindtoindex(sock->sk, cfg->bind_ifindex, true); if (err < 0) goto error; } udp6_addr.sin6_family = AF_INET6; memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6, sizeof(udp6_addr.sin6_addr)); udp6_addr.sin6_port = cfg->local_udp_port; err = kernel_bind(sock, (struct sockaddr *)&udp6_addr, sizeof(udp6_addr)); if (err < 0) goto error; if (cfg->peer_udp_port) { memset(&udp6_addr, 0, sizeof(udp6_addr)); udp6_addr.sin6_family = AF_INET6; memcpy(&udp6_addr.sin6_addr, &cfg->peer_ip6, sizeof(udp6_addr.sin6_addr)); udp6_addr.sin6_port = cfg->peer_udp_port; err = 
kernel_connect(sock, (struct sockaddr *)&udp6_addr, sizeof(udp6_addr), 0); } if (err < 0) goto error; udp_set_no_check6_tx(sock->sk, !cfg->use_udp6_tx_checksums); udp_set_no_check6_rx(sock->sk, !cfg->use_udp6_rx_checksums); *sockp = sock; return 0; error: if (sock) { kernel_sock_shutdown(sock, SHUT_RDWR); sock_release(sock); } *sockp = NULL; return err; } EXPORT_SYMBOL_GPL(udp_sock_create6); int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, struct net_device *dev, const struct in6_addr *saddr, const struct in6_addr *daddr, __u8 prio, __u8 ttl, __be32 label, __be16 src_port, __be16 dst_port, bool nocheck) { struct udphdr *uh; struct ipv6hdr *ip6h; __skb_push(skb, sizeof(*uh)); skb_reset_transport_header(skb); uh = udp_hdr(skb); uh->dest = dst_port; uh->source = src_port; uh->len = htons(skb->len); skb_dst_set(skb, dst); udp6_set_csum(nocheck, skb, saddr, daddr, skb->len); __skb_push(skb, sizeof(*ip6h)); skb_reset_network_header(skb); ip6h = ipv6_hdr(skb); ip6_flow_hdr(ip6h, prio, label); ip6h->payload_len = htons(skb->len); ip6h->nexthdr = IPPROTO_UDP; ip6h->hop_limit = ttl; ip6h->daddr = *daddr; ip6h->saddr = *saddr; ip6tunnel_xmit(sk, skb, dev); return 0; } EXPORT_SYMBOL_GPL(udp_tunnel6_xmit_skb); /** * udp_tunnel6_dst_lookup - perform route lookup on UDP tunnel * @skb: Packet for which lookup is done * @dev: Tunnel device * @net: Network namespace of tunnel device * @sock: Socket which provides route info * @oif: Index of the output interface * @saddr: Memory to store the src ip address * @key: Tunnel information * @sport: UDP source port * @dport: UDP destination port * @dsfield: The traffic class field * @dst_cache: The dst cache to use for lookup * This function performs a route lookup on a UDP tunnel * * It returns a valid dst pointer and stores src address to be used in * tunnel in param saddr on success, else a pointer encoded error code. 
*/ struct dst_entry *udp_tunnel6_dst_lookup(struct sk_buff *skb, struct net_device *dev, struct net *net, struct socket *sock, int oif, struct in6_addr *saddr, const struct ip_tunnel_key *key, __be16 sport, __be16 dport, u8 dsfield, struct dst_cache *dst_cache) { struct dst_entry *dst = NULL; struct flowi6 fl6; #ifdef CONFIG_DST_CACHE if (dst_cache) { dst = dst_cache_get_ip6(dst_cache, saddr); if (dst) return dst; } #endif memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_mark = skb->mark; fl6.flowi6_proto = IPPROTO_UDP; fl6.flowi6_oif = oif; fl6.daddr = key->u.ipv6.dst; fl6.saddr = key->u.ipv6.src; fl6.fl6_sport = sport; fl6.fl6_dport = dport; fl6.flowlabel = ip6_make_flowinfo(dsfield, key->label); dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6, NULL); if (IS_ERR(dst)) { netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr); return ERR_PTR(-ENETUNREACH); } if (dst->dev == dev) { /* is this necessary? */ netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr); dst_release(dst); return ERR_PTR(-ELOOP); } #ifdef CONFIG_DST_CACHE if (dst_cache) dst_cache_set_ip6(dst_cache, dst, &fl6.saddr); #endif *saddr = fl6.saddr; return dst; } EXPORT_SYMBOL_GPL(udp_tunnel6_dst_lookup); MODULE_DESCRIPTION("IPv6 Foo over UDP tunnel driver"); MODULE_LICENSE("GPL");
2 2 2 2 2 1 1 1 1 1 4 16 3 3 3 3 3 3 12 4 1 2 1 5 5 5 2 4 4 2 2 5 4 7 7 7 4 1 3 1 19 1 2 4 1 1 1 4 9 4 5 4 1 2 7 7 16 1 1 4 4 3 7 8 2 3 3 3 3 3 3 3 3 12 12 1 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 
487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/sch_gred.c Generic Random Early Detection queue. 
* * Authors: J Hadi Salim (hadi@cyberus.ca) 1998-2002 * * 991129: - Bug fix with grio mode * - a better sing. AvgQ mode with Grio(WRED) * - A finer grained VQ dequeue based on suggestion * from Ren Liu * - More error checks * * For all the glorious comments look at include/net/red.h */ #include <linux/slab.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <net/pkt_cls.h> #include <net/pkt_sched.h> #include <net/red.h> #define GRED_DEF_PRIO (MAX_DPs / 2) #define GRED_VQ_MASK (MAX_DPs - 1) #define GRED_VQ_RED_FLAGS (TC_RED_ECN | TC_RED_HARDDROP) struct gred_sched_data; struct gred_sched; struct gred_sched_data { u32 limit; /* HARD maximal queue length */ u32 DP; /* the drop parameters */ u32 red_flags; /* virtualQ version of red_flags */ u64 bytesin; /* bytes seen on virtualQ so far*/ u32 packetsin; /* packets seen on virtualQ so far*/ u32 backlog; /* bytes on the virtualQ */ u8 prio; /* the prio of this vq */ struct red_parms parms; struct red_vars vars; struct red_stats stats; }; enum { GRED_WRED_MODE = 1, GRED_RIO_MODE, }; struct gred_sched { struct gred_sched_data *tab[MAX_DPs]; unsigned long flags; u32 red_flags; u32 DPs; u32 def; struct red_vars wred_set; struct tc_gred_qopt_offload *opt; }; static inline int gred_wred_mode(struct gred_sched *table) { return test_bit(GRED_WRED_MODE, &table->flags); } static inline void gred_enable_wred_mode(struct gred_sched *table) { __set_bit(GRED_WRED_MODE, &table->flags); } static inline void gred_disable_wred_mode(struct gred_sched *table) { __clear_bit(GRED_WRED_MODE, &table->flags); } static inline int gred_rio_mode(struct gred_sched *table) { return test_bit(GRED_RIO_MODE, &table->flags); } static inline void gred_enable_rio_mode(struct gred_sched *table) { __set_bit(GRED_RIO_MODE, &table->flags); } static inline void gred_disable_rio_mode(struct gred_sched *table) { __clear_bit(GRED_RIO_MODE, &table->flags); } static inline int gred_wred_mode_check(struct 
Qdisc *sch) { struct gred_sched *table = qdisc_priv(sch); int i; /* Really ugly O(n^2) but shouldn't be necessary too frequent. */ for (i = 0; i < table->DPs; i++) { struct gred_sched_data *q = table->tab[i]; int n; if (q == NULL) continue; for (n = i + 1; n < table->DPs; n++) if (table->tab[n] && table->tab[n]->prio == q->prio) return 1; } return 0; } static inline unsigned int gred_backlog(struct gred_sched *table, struct gred_sched_data *q, struct Qdisc *sch) { if (gred_wred_mode(table)) return sch->qstats.backlog; else return q->backlog; } static inline u16 tc_index_to_dp(struct sk_buff *skb) { return skb->tc_index & GRED_VQ_MASK; } static inline void gred_load_wred_set(const struct gred_sched *table, struct gred_sched_data *q) { q->vars.qavg = table->wred_set.qavg; q->vars.qidlestart = table->wred_set.qidlestart; } static inline void gred_store_wred_set(struct gred_sched *table, struct gred_sched_data *q) { table->wred_set.qavg = q->vars.qavg; table->wred_set.qidlestart = q->vars.qidlestart; } static int gred_use_ecn(struct gred_sched_data *q) { return q->red_flags & TC_RED_ECN; } static int gred_use_harddrop(struct gred_sched_data *q) { return q->red_flags & TC_RED_HARDDROP; } static bool gred_per_vq_red_flags_used(struct gred_sched *table) { unsigned int i; /* Local per-vq flags couldn't have been set unless global are 0 */ if (table->red_flags) return false; for (i = 0; i < MAX_DPs; i++) if (table->tab[i] && table->tab[i]->red_flags) return true; return false; } static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct gred_sched_data *q = NULL; struct gred_sched *t = qdisc_priv(sch); unsigned long qavg = 0; u16 dp = tc_index_to_dp(skb); if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { dp = t->def; q = t->tab[dp]; if (!q) { /* Pass through packets not assigned to a DP * if no default DP has been configured. This * allows for DP flows to be left untouched. 
*/ if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <= sch->limit)) return qdisc_enqueue_tail(skb, sch); else goto drop; } /* fix tc_index? --could be controversial but needed for requeueing */ skb->tc_index = (skb->tc_index & ~GRED_VQ_MASK) | dp; } /* sum up all the qaves of prios < ours to get the new qave */ if (!gred_wred_mode(t) && gred_rio_mode(t)) { int i; for (i = 0; i < t->DPs; i++) { if (t->tab[i] && t->tab[i]->prio < q->prio && !red_is_idling(&t->tab[i]->vars)) qavg += t->tab[i]->vars.qavg; } } q->packetsin++; q->bytesin += qdisc_pkt_len(skb); if (gred_wred_mode(t)) gred_load_wred_set(t, q); q->vars.qavg = red_calc_qavg(&q->parms, &q->vars, gred_backlog(t, q, sch)); if (red_is_idling(&q->vars)) red_end_of_idle_period(&q->vars); if (gred_wred_mode(t)) gred_store_wred_set(t, q); switch (red_action(&q->parms, &q->vars, q->vars.qavg + qavg)) { case RED_DONT_MARK: break; case RED_PROB_MARK: qdisc_qstats_overlimit(sch); if (!gred_use_ecn(q) || !INET_ECN_set_ce(skb)) { q->stats.prob_drop++; goto congestion_drop; } q->stats.prob_mark++; break; case RED_HARD_MARK: qdisc_qstats_overlimit(sch); if (gred_use_harddrop(q) || !gred_use_ecn(q) || !INET_ECN_set_ce(skb)) { q->stats.forced_drop++; goto congestion_drop; } q->stats.forced_mark++; break; } if (gred_backlog(t, q, sch) + qdisc_pkt_len(skb) <= q->limit) { q->backlog += qdisc_pkt_len(skb); return qdisc_enqueue_tail(skb, sch); } q->stats.pdrop++; drop: return qdisc_drop(skb, sch, to_free); congestion_drop: qdisc_drop(skb, sch, to_free); return NET_XMIT_CN; } static struct sk_buff *gred_dequeue(struct Qdisc *sch) { struct sk_buff *skb; struct gred_sched *t = qdisc_priv(sch); skb = qdisc_dequeue_head(sch); if (skb) { struct gred_sched_data *q; u16 dp = tc_index_to_dp(skb); if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { net_warn_ratelimited("GRED: Unable to relocate VQ 0x%x after dequeue, screwing up backlog\n", tc_index_to_dp(skb)); } else { q->backlog -= qdisc_pkt_len(skb); if (gred_wred_mode(t)) { if 
(!sch->qstats.backlog) red_start_of_idle_period(&t->wred_set); } else { if (!q->backlog) red_start_of_idle_period(&q->vars); } } return skb; } return NULL; } static void gred_reset(struct Qdisc *sch) { int i; struct gred_sched *t = qdisc_priv(sch); qdisc_reset_queue(sch); for (i = 0; i < t->DPs; i++) { struct gred_sched_data *q = t->tab[i]; if (!q) continue; red_restart(&q->vars); q->backlog = 0; } } static void gred_offload(struct Qdisc *sch, enum tc_gred_command command) { struct gred_sched *table = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct tc_gred_qopt_offload *opt = table->opt; if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) return; memset(opt, 0, sizeof(*opt)); opt->command = command; opt->handle = sch->handle; opt->parent = sch->parent; if (command == TC_GRED_REPLACE) { unsigned int i; opt->set.grio_on = gred_rio_mode(table); opt->set.wred_on = gred_wred_mode(table); opt->set.dp_cnt = table->DPs; opt->set.dp_def = table->def; for (i = 0; i < table->DPs; i++) { struct gred_sched_data *q = table->tab[i]; if (!q) continue; opt->set.tab[i].present = true; opt->set.tab[i].limit = q->limit; opt->set.tab[i].prio = q->prio; opt->set.tab[i].min = q->parms.qth_min >> q->parms.Wlog; opt->set.tab[i].max = q->parms.qth_max >> q->parms.Wlog; opt->set.tab[i].is_ecn = gred_use_ecn(q); opt->set.tab[i].is_harddrop = gred_use_harddrop(q); opt->set.tab[i].probability = q->parms.max_P; opt->set.tab[i].backlog = &q->backlog; } opt->set.qstats = &sch->qstats; } dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_GRED, opt); } static int gred_offload_dump_stats(struct Qdisc *sch) { struct gred_sched *table = qdisc_priv(sch); struct tc_gred_qopt_offload *hw_stats; u64 bytes = 0, packets = 0; unsigned int i; int ret; hw_stats = kzalloc(sizeof(*hw_stats), GFP_KERNEL); if (!hw_stats) return -ENOMEM; hw_stats->command = TC_GRED_STATS; hw_stats->handle = sch->handle; hw_stats->parent = sch->parent; for (i = 0; i < MAX_DPs; i++) { 
gnet_stats_basic_sync_init(&hw_stats->stats.bstats[i]); if (table->tab[i]) hw_stats->stats.xstats[i] = &table->tab[i]->stats; } ret = qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_GRED, hw_stats); /* Even if driver returns failure adjust the stats - in case offload * ended but driver still wants to adjust the values. */ sch_tree_lock(sch); for (i = 0; i < MAX_DPs; i++) { if (!table->tab[i]) continue; table->tab[i]->packetsin += u64_stats_read(&hw_stats->stats.bstats[i].packets); table->tab[i]->bytesin += u64_stats_read(&hw_stats->stats.bstats[i].bytes); table->tab[i]->backlog += hw_stats->stats.qstats[i].backlog; bytes += u64_stats_read(&hw_stats->stats.bstats[i].bytes); packets += u64_stats_read(&hw_stats->stats.bstats[i].packets); sch->qstats.qlen += hw_stats->stats.qstats[i].qlen; sch->qstats.backlog += hw_stats->stats.qstats[i].backlog; sch->qstats.drops += hw_stats->stats.qstats[i].drops; sch->qstats.requeues += hw_stats->stats.qstats[i].requeues; sch->qstats.overlimits += hw_stats->stats.qstats[i].overlimits; } _bstats_update(&sch->bstats, bytes, packets); sch_tree_unlock(sch); kfree(hw_stats); return ret; } static inline void gred_destroy_vq(struct gred_sched_data *q) { kfree(q); } static int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps, struct netlink_ext_ack *extack) { struct gred_sched *table = qdisc_priv(sch); struct tc_gred_sopt *sopt; bool red_flags_changed; int i; if (!dps) return -EINVAL; sopt = nla_data(dps); if (sopt->DPs > MAX_DPs) { NL_SET_ERR_MSG_MOD(extack, "number of virtual queues too high"); return -EINVAL; } if (sopt->DPs == 0) { NL_SET_ERR_MSG_MOD(extack, "number of virtual queues can't be 0"); return -EINVAL; } if (sopt->def_DP >= sopt->DPs) { NL_SET_ERR_MSG_MOD(extack, "default virtual queue above virtual queue count"); return -EINVAL; } if (sopt->flags && gred_per_vq_red_flags_used(table)) { NL_SET_ERR_MSG_MOD(extack, "can't set per-Qdisc RED flags when per-virtual queue flags are used"); return -EINVAL; } 
sch_tree_lock(sch); table->DPs = sopt->DPs; table->def = sopt->def_DP; red_flags_changed = table->red_flags != sopt->flags; table->red_flags = sopt->flags; /* * Every entry point to GRED is synchronized with the above code * and the DP is checked against DPs, i.e. shadowed VQs can no * longer be found so we can unlock right here. */ sch_tree_unlock(sch); if (sopt->grio) { gred_enable_rio_mode(table); gred_disable_wred_mode(table); if (gred_wred_mode_check(sch)) gred_enable_wred_mode(table); } else { gred_disable_rio_mode(table); gred_disable_wred_mode(table); } if (red_flags_changed) for (i = 0; i < table->DPs; i++) if (table->tab[i]) table->tab[i]->red_flags = table->red_flags & GRED_VQ_RED_FLAGS; for (i = table->DPs; i < MAX_DPs; i++) { if (table->tab[i]) { pr_warn("GRED: Warning: Destroying shadowed VQ 0x%x\n", i); gred_destroy_vq(table->tab[i]); table->tab[i] = NULL; } } gred_offload(sch, TC_GRED_REPLACE); return 0; } static inline int gred_change_vq(struct Qdisc *sch, int dp, struct tc_gred_qopt *ctl, int prio, u8 *stab, u32 max_P, struct gred_sched_data **prealloc, struct netlink_ext_ack *extack) { struct gred_sched *table = qdisc_priv(sch); struct gred_sched_data *q = table->tab[dp]; if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Scell_log, stab)) { NL_SET_ERR_MSG_MOD(extack, "invalid RED parameters"); return -EINVAL; } if (!q) { table->tab[dp] = q = *prealloc; *prealloc = NULL; if (!q) return -ENOMEM; q->red_flags = table->red_flags & GRED_VQ_RED_FLAGS; } q->DP = dp; q->prio = prio; if (ctl->limit > sch->limit) q->limit = sch->limit; else q->limit = ctl->limit; if (q->backlog == 0) red_end_of_idle_period(&q->vars); red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Plog, ctl->Scell_log, stab, max_P); red_set_vars(&q->vars); return 0; } static const struct nla_policy gred_vq_policy[TCA_GRED_VQ_MAX + 1] = { [TCA_GRED_VQ_DP] = { .type = NLA_U32 }, [TCA_GRED_VQ_FLAGS] = { .type = NLA_U32 }, }; static const struct nla_policy 
gred_vqe_policy[TCA_GRED_VQ_ENTRY_MAX + 1] = { [TCA_GRED_VQ_ENTRY] = { .type = NLA_NESTED }, }; static const struct nla_policy gred_policy[TCA_GRED_MAX + 1] = { [TCA_GRED_PARMS] = { .len = sizeof(struct tc_gred_qopt) }, [TCA_GRED_STAB] = { .len = 256 }, [TCA_GRED_DPS] = { .len = sizeof(struct tc_gred_sopt) }, [TCA_GRED_MAX_P] = { .type = NLA_U32 }, [TCA_GRED_LIMIT] = { .type = NLA_U32 }, [TCA_GRED_VQ_LIST] = { .type = NLA_NESTED }, }; static void gred_vq_apply(struct gred_sched *table, const struct nlattr *entry) { struct nlattr *tb[TCA_GRED_VQ_MAX + 1]; u32 dp; nla_parse_nested_deprecated(tb, TCA_GRED_VQ_MAX, entry, gred_vq_policy, NULL); dp = nla_get_u32(tb[TCA_GRED_VQ_DP]); if (tb[TCA_GRED_VQ_FLAGS]) table->tab[dp]->red_flags = nla_get_u32(tb[TCA_GRED_VQ_FLAGS]); } static void gred_vqs_apply(struct gred_sched *table, struct nlattr *vqs) { const struct nlattr *attr; int rem; nla_for_each_nested(attr, vqs, rem) { switch (nla_type(attr)) { case TCA_GRED_VQ_ENTRY: gred_vq_apply(table, attr); break; } } } static int gred_vq_validate(struct gred_sched *table, u32 cdp, const struct nlattr *entry, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_GRED_VQ_MAX + 1]; int err; u32 dp; err = nla_parse_nested_deprecated(tb, TCA_GRED_VQ_MAX, entry, gred_vq_policy, extack); if (err < 0) return err; if (!tb[TCA_GRED_VQ_DP]) { NL_SET_ERR_MSG_MOD(extack, "Virtual queue with no index specified"); return -EINVAL; } dp = nla_get_u32(tb[TCA_GRED_VQ_DP]); if (dp >= table->DPs) { NL_SET_ERR_MSG_MOD(extack, "Virtual queue with index out of bounds"); return -EINVAL; } if (dp != cdp && !table->tab[dp]) { NL_SET_ERR_MSG_MOD(extack, "Virtual queue not yet instantiated"); return -EINVAL; } if (tb[TCA_GRED_VQ_FLAGS]) { u32 red_flags = nla_get_u32(tb[TCA_GRED_VQ_FLAGS]); if (table->red_flags && table->red_flags != red_flags) { NL_SET_ERR_MSG_MOD(extack, "can't change per-virtual queue RED flags when per-Qdisc flags are used"); return -EINVAL; } if (red_flags & ~GRED_VQ_RED_FLAGS) { 
NL_SET_ERR_MSG_MOD(extack, "invalid RED flags specified"); return -EINVAL; } } return 0; } static int gred_vqs_validate(struct gred_sched *table, u32 cdp, struct nlattr *vqs, struct netlink_ext_ack *extack) { const struct nlattr *attr; int rem, err; err = nla_validate_nested_deprecated(vqs, TCA_GRED_VQ_ENTRY_MAX, gred_vqe_policy, extack); if (err < 0) return err; nla_for_each_nested(attr, vqs, rem) { switch (nla_type(attr)) { case TCA_GRED_VQ_ENTRY: err = gred_vq_validate(table, cdp, attr, extack); if (err) return err; break; default: NL_SET_ERR_MSG_MOD(extack, "GRED_VQ_LIST can contain only entry attributes"); return -EINVAL; } } if (rem > 0) { NL_SET_ERR_MSG_MOD(extack, "Trailing data after parsing virtual queue list"); return -EINVAL; } return 0; } static int gred_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct gred_sched *table = qdisc_priv(sch); struct tc_gred_qopt *ctl; struct nlattr *tb[TCA_GRED_MAX + 1]; int err, prio = GRED_DEF_PRIO; u8 *stab; u32 max_P; struct gred_sched_data *prealloc; err = nla_parse_nested_deprecated(tb, TCA_GRED_MAX, opt, gred_policy, extack); if (err < 0) return err; if (tb[TCA_GRED_PARMS] == NULL && tb[TCA_GRED_STAB] == NULL) { if (tb[TCA_GRED_LIMIT] != NULL) sch->limit = nla_get_u32(tb[TCA_GRED_LIMIT]); return gred_change_table_def(sch, tb[TCA_GRED_DPS], extack); } if (tb[TCA_GRED_PARMS] == NULL || tb[TCA_GRED_STAB] == NULL || tb[TCA_GRED_LIMIT] != NULL) { NL_SET_ERR_MSG_MOD(extack, "can't configure Qdisc and virtual queue at the same time"); return -EINVAL; } max_P = tb[TCA_GRED_MAX_P] ? 
nla_get_u32(tb[TCA_GRED_MAX_P]) : 0; ctl = nla_data(tb[TCA_GRED_PARMS]); stab = nla_data(tb[TCA_GRED_STAB]); if (ctl->DP >= table->DPs) { NL_SET_ERR_MSG_MOD(extack, "virtual queue index above virtual queue count"); return -EINVAL; } if (tb[TCA_GRED_VQ_LIST]) { err = gred_vqs_validate(table, ctl->DP, tb[TCA_GRED_VQ_LIST], extack); if (err) return err; } if (gred_rio_mode(table)) { if (ctl->prio == 0) { int def_prio = GRED_DEF_PRIO; if (table->tab[table->def]) def_prio = table->tab[table->def]->prio; printk(KERN_DEBUG "GRED: DP %u does not have a prio " "setting default to %d\n", ctl->DP, def_prio); prio = def_prio; } else prio = ctl->prio; } prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL); sch_tree_lock(sch); err = gred_change_vq(sch, ctl->DP, ctl, prio, stab, max_P, &prealloc, extack); if (err < 0) goto err_unlock_free; if (tb[TCA_GRED_VQ_LIST]) gred_vqs_apply(table, tb[TCA_GRED_VQ_LIST]); if (gred_rio_mode(table)) { gred_disable_wred_mode(table); if (gred_wred_mode_check(sch)) gred_enable_wred_mode(table); } sch_tree_unlock(sch); kfree(prealloc); gred_offload(sch, TC_GRED_REPLACE); return 0; err_unlock_free: sch_tree_unlock(sch); kfree(prealloc); return err; } static int gred_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct gred_sched *table = qdisc_priv(sch); struct nlattr *tb[TCA_GRED_MAX + 1]; int err; if (!opt) return -EINVAL; err = nla_parse_nested_deprecated(tb, TCA_GRED_MAX, opt, gred_policy, extack); if (err < 0) return err; if (tb[TCA_GRED_PARMS] || tb[TCA_GRED_STAB]) { NL_SET_ERR_MSG_MOD(extack, "virtual queue configuration can't be specified at initialization time"); return -EINVAL; } if (tb[TCA_GRED_LIMIT]) sch->limit = nla_get_u32(tb[TCA_GRED_LIMIT]); else sch->limit = qdisc_dev(sch)->tx_queue_len * psched_mtu(qdisc_dev(sch)); if (qdisc_dev(sch)->netdev_ops->ndo_setup_tc) { table->opt = kzalloc(sizeof(*table->opt), GFP_KERNEL); if (!table->opt) return -ENOMEM; } return gred_change_table_def(sch, tb[TCA_GRED_DPS], 
extack); }

/*
 * gred_dump() - write the GRED configuration and per-VQ statistics into @skb.
 * @sch: the GRED qdisc being dumped
 * @skb: netlink message under construction
 *
 * Everything is nested under TCA_OPTIONS and emitted in this order:
 *   TCA_GRED_DPS      - table-wide struct tc_gred_sopt
 *   TCA_GRED_MAX_P    - one u32 per DP slot (MAX_DPs entries)
 *   TCA_GRED_LIMIT    - sch->limit
 *   TCA_GRED_PARMS    - one struct tc_gred_qopt per DP slot (MAX_DPs entries)
 *   TCA_GRED_VQ_LIST  - one nested TCA_GRED_VQ_ENTRY per configured VQ
 *
 * Returns the result of nla_nest_end() on the TCA_OPTIONS nest on success.
 * Every failure path cancels the TCA_OPTIONS nest and returns -EMSGSIZE,
 * including a non-zero return from gred_offload_dump_stats().
 */
static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct gred_sched *table = qdisc_priv(sch);
	struct nlattr *parms, *vqs, *opts = NULL;
	int i;
	u32 max_p[MAX_DPs];
	struct tc_gred_sopt sopt = {
		.DPs	= table->DPs,
		.def_DP	= table->def,
		.grio	= gred_rio_mode(table),
		.flags	= table->red_flags,
	};

	/*
	 * opts is still NULL on this path, so the cancel at nla_put_failure
	 * is reached with a NULL nest.
	 * NOTE(review): relies on nla_nest_cancel() tolerating NULL - confirm.
	 */
	if (gred_offload_dump_stats(sch))
		goto nla_put_failure;

	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
	if (opts == NULL)
		goto nla_put_failure;
	if (nla_put(skb, TCA_GRED_DPS, sizeof(sopt), &sopt))
		goto nla_put_failure;

	/* Slots without a VQ report a max_P of 0. */
	for (i = 0; i < MAX_DPs; i++) {
		struct gred_sched_data *q = table->tab[i];

		max_p[i] = q ? q->parms.max_P : 0;
	}
	if (nla_put(skb, TCA_GRED_MAX_P, sizeof(max_p), max_p))
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_GRED_LIMIT, sch->limit))
		goto nla_put_failure;

	/* Old style all-in-one dump of VQs */
	parms = nla_nest_start_noflag(skb, TCA_GRED_PARMS);
	if (parms == NULL)
		goto nla_put_failure;

	for (i = 0; i < MAX_DPs; i++) {
		struct gred_sched_data *q = table->tab[i];
		struct tc_gred_qopt opt;
		unsigned long qavg;

		/* Start from an all-zero record for every slot. */
		memset(&opt, 0, sizeof(opt));

		if (!q) {
			/*
			 * Hack -- fix at some point with a proper message.
			 * This is how we indicate to tc that there is no VQ
			 * at this DP: the DP field is set to a value that is
			 * out of range (MAX_DPs + slot index).
			 */
			opt.DP = MAX_DPs + i;
			goto append_opt;
		}

		opt.limit	= q->limit;
		opt.DP		= q->DP;
		opt.backlog	= gred_backlog(table, q, sch);
		opt.prio	= q->prio;
		/* Thresholds are stored shifted left by Wlog; undo that. */
		opt.qth_min	= q->parms.qth_min >> q->parms.Wlog;
		opt.qth_max	= q->parms.qth_max >> q->parms.Wlog;
		opt.Wlog	= q->parms.Wlog;
		opt.Plog	= q->parms.Plog;
		opt.Scell_log	= q->parms.Scell_log;
		opt.early	= q->stats.prob_drop;
		opt.forced	= q->stats.forced_drop;
		opt.pdrop	= q->stats.pdrop;
		opt.packets	= q->packetsin;
		opt.bytesin	= q->bytesin;

		/* In WRED mode, load the shared WRED state into this VQ first. */
		if (gred_wred_mode(table))
			gred_load_wred_set(table, q);

		qavg = red_calc_qavg(&q->parms, &q->vars,
				     q->vars.qavg >> q->parms.Wlog);
		opt.qave = qavg >> q->parms.Wlog;

append_opt:
		/* Records are appended back to back inside TCA_GRED_PARMS. */
		if (nla_append(skb, sizeof(opt), &opt) < 0)
			goto nla_put_failure;
	}

	nla_nest_end(skb, parms);

	/* Dump the VQs again, in more structured way */
	vqs = nla_nest_start_noflag(skb, TCA_GRED_VQ_LIST);
	if (!vqs)
		goto nla_put_failure;

	for (i = 0; i < MAX_DPs; i++) {
		struct gred_sched_data *q = table->tab[i];
		struct nlattr *vq;

		/* Unlike the old-style dump, empty slots are simply omitted. */
		if (!q)
			continue;

		vq = nla_nest_start_noflag(skb, TCA_GRED_VQ_ENTRY);
		if (!vq)
			goto nla_put_failure;

		if (nla_put_u32(skb, TCA_GRED_VQ_DP, q->DP))
			goto nla_put_failure;

		if (nla_put_u32(skb, TCA_GRED_VQ_FLAGS, q->red_flags))
			goto nla_put_failure;

		/* Stats */
		if (nla_put_u64_64bit(skb, TCA_GRED_VQ_STAT_BYTES, q->bytesin,
				      TCA_GRED_VQ_PAD))
			goto nla_put_failure;
		if (nla_put_u32(skb, TCA_GRED_VQ_STAT_PACKETS, q->packetsin))
			goto nla_put_failure;
		if (nla_put_u32(skb, TCA_GRED_VQ_STAT_BACKLOG,
				gred_backlog(table, q, sch)))
			goto nla_put_failure;
		if (nla_put_u32(skb, TCA_GRED_VQ_STAT_PROB_DROP,
				q->stats.prob_drop))
			goto nla_put_failure;
		if (nla_put_u32(skb, TCA_GRED_VQ_STAT_PROB_MARK,
				q->stats.prob_mark))
			goto nla_put_failure;
		if (nla_put_u32(skb, TCA_GRED_VQ_STAT_FORCED_DROP,
				q->stats.forced_drop))
			goto nla_put_failure;
		if (nla_put_u32(skb, TCA_GRED_VQ_STAT_FORCED_MARK,
				q->stats.forced_mark))
			goto nla_put_failure;
		if (nla_put_u32(skb, TCA_GRED_VQ_STAT_PDROP, q->stats.pdrop))
			goto nla_put_failure;

		nla_nest_end(skb, vq);
	}
	nla_nest_end(skb, vqs);

	return nla_nest_end(skb, opts);

nla_put_failure:
	/* Cancelling the outer nest discards everything written above. */
	nla_nest_cancel(skb, opts);
	return -EMSGSIZE;
}

/*
 * gred_destroy() - tear down the qdisc's private state.
 *
 * Destroys the first table->DPs entries of table->tab[], signals
 * TC_GRED_DESTROY through gred_offload() and frees table->opt.
 */
static void gred_destroy(struct Qdisc *sch)
{
	struct gred_sched *table = qdisc_priv(sch);
	int i;

	for (i = 0; i < table->DPs; i++)
		gred_destroy_vq(table->tab[i]);

	gred_offload(sch, TC_GRED_DESTROY);
	kfree(table->opt);
}

/* Operations table registered for the "gred" qdisc; no class ops are set. */
static struct Qdisc_ops gred_qdisc_ops __read_mostly = {
	.id		=	"gred",
	.priv_size	=	sizeof(struct gred_sched),
	.enqueue	=	gred_enqueue,
	.dequeue	=	gred_dequeue,
	.peek		=	qdisc_peek_head,
	.init		=	gred_init,
	.reset		=	gred_reset,
	.destroy	=	gred_destroy,
	.change		=	gred_change,
	.dump		=	gred_dump,
	.owner		=	THIS_MODULE,
};
MODULE_ALIAS_NET_SCH("gred");

/* Module load: register the qdisc ops. */
static int __init gred_module_init(void)
{
	return register_qdisc(&gred_qdisc_ops);
}

/* Module unload: unregister the qdisc ops. */
static void __exit gred_module_exit(void)
{
	unregister_qdisc(&gred_qdisc_ops);
}

module_init(gred_module_init)
module_exit(gred_module_exit)

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Generic Random Early Detection qdisc");
1437 54 1425 1420 1412 990 169 47 68 5 16 40 150 4 1 145 229 76 154 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 // SPDX-License-Identifier: GPL-2.0-only /* * Access kernel or user memory without faulting. */ #include <linux/export.h> #include <linux/mm.h> #include <linux/uaccess.h> #include <asm/tlb.h> bool __weak copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) { return true; } #define copy_from_kernel_nofault_loop(dst, src, len, type, err_label) \ while (len >= sizeof(type)) { \ __get_kernel_nofault(dst, src, type, err_label); \ dst += sizeof(type); \ src += sizeof(type); \ len -= sizeof(type); \ } long copy_from_kernel_nofault(void *dst, const void *src, size_t size) { unsigned long align = 0; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) align = (unsigned long)dst | (unsigned long)src; if (!copy_from_kernel_nofault_allowed(src, size)) return -ERANGE; pagefault_disable(); if (!(align & 7)) copy_from_kernel_nofault_loop(dst, src, size, u64, Efault); if (!(align & 3)) copy_from_kernel_nofault_loop(dst, src, size, u32, Efault); if (!(align & 1)) copy_from_kernel_nofault_loop(dst, src, size, u16, Efault); copy_from_kernel_nofault_loop(dst, src, size, u8, Efault); 
pagefault_enable(); return 0; Efault: pagefault_enable(); return -EFAULT; } EXPORT_SYMBOL_GPL(copy_from_kernel_nofault); #define copy_to_kernel_nofault_loop(dst, src, len, type, err_label) \ while (len >= sizeof(type)) { \ __put_kernel_nofault(dst, src, type, err_label); \ dst += sizeof(type); \ src += sizeof(type); \ len -= sizeof(type); \ } long copy_to_kernel_nofault(void *dst, const void *src, size_t size) { unsigned long align = 0; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) align = (unsigned long)dst | (unsigned long)src; pagefault_disable(); if (!(align & 7)) copy_to_kernel_nofault_loop(dst, src, size, u64, Efault); if (!(align & 3)) copy_to_kernel_nofault_loop(dst, src, size, u32, Efault); if (!(align & 1)) copy_to_kernel_nofault_loop(dst, src, size, u16, Efault); copy_to_kernel_nofault_loop(dst, src, size, u8, Efault); pagefault_enable(); return 0; Efault: pagefault_enable(); return -EFAULT; } long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count) { const void *src = unsafe_addr; if (unlikely(count <= 0)) return 0; if (!copy_from_kernel_nofault_allowed(unsafe_addr, count)) return -ERANGE; pagefault_disable(); do { __get_kernel_nofault(dst, src, u8, Efault); dst++; src++; } while (dst[-1] && src - unsafe_addr < count); pagefault_enable(); dst[-1] = '\0'; return src - unsafe_addr; Efault: pagefault_enable(); dst[0] = '\0'; return -EFAULT; } /** * copy_from_user_nofault(): safely attempt to read from a user-space location * @dst: pointer to the buffer that shall take the data * @src: address to read from. This must be a user address. * @size: size of the data chunk * * Safely read from user address @src to the buffer at @dst. If a kernel fault * happens, handle that and return -EFAULT. 
*/ long copy_from_user_nofault(void *dst, const void __user *src, size_t size) { long ret = -EFAULT; if (!__access_ok(src, size)) return ret; if (!nmi_uaccess_okay()) return ret; pagefault_disable(); ret = __copy_from_user_inatomic(dst, src, size); pagefault_enable(); if (ret) return -EFAULT; return 0; } EXPORT_SYMBOL_GPL(copy_from_user_nofault); /** * copy_to_user_nofault(): safely attempt to write to a user-space location * @dst: address to write to * @src: pointer to the data that shall be written * @size: size of the data chunk * * Safely write to address @dst from the buffer at @src. If a kernel fault * happens, handle that and return -EFAULT. */ long copy_to_user_nofault(void __user *dst, const void *src, size_t size) { long ret = -EFAULT; if (access_ok(dst, size)) { pagefault_disable(); ret = __copy_to_user_inatomic(dst, src, size); pagefault_enable(); } if (ret) return -EFAULT; return 0; } EXPORT_SYMBOL_GPL(copy_to_user_nofault); /** * strncpy_from_user_nofault: - Copy a NUL terminated string from unsafe user * address. * @dst: Destination address, in kernel space. This buffer must be at * least @count bytes long. * @unsafe_addr: Unsafe user address. * @count: Maximum number of bytes to copy, including the trailing NUL. * * Copies a NUL-terminated string from unsafe user address to kernel buffer. * * On success, returns the length of the string INCLUDING the trailing NUL. * * If access fails, returns -EFAULT (some data may have been copied * and the trailing NUL added). * * If @count is smaller than the length of the string, copies @count-1 bytes, * sets the last byte of @dst buffer to NUL and returns @count. 
*/ long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr, long count) { long ret; if (unlikely(count <= 0)) return 0; pagefault_disable(); ret = strncpy_from_user(dst, unsafe_addr, count); pagefault_enable(); if (ret >= count) { ret = count; dst[ret - 1] = '\0'; } else if (ret > 0) { ret++; } return ret; } /** * strnlen_user_nofault: - Get the size of a user string INCLUDING final NUL. * @unsafe_addr: The string to measure. * @count: Maximum count (including NUL) * * Get the size of a NUL-terminated string in user space without pagefault. * * Returns the size of the string INCLUDING the terminating NUL. * * If the string is too long, returns a number larger than @count. User * has to check the return value against "> count". * On exception (or invalid count), returns 0. * * Unlike strnlen_user, this can be used from IRQ handler etc. because * it disables pagefaults. */ long strnlen_user_nofault(const void __user *unsafe_addr, long count) { int ret; pagefault_disable(); ret = strnlen_user(unsafe_addr, count); pagefault_enable(); return ret; } void __copy_overflow(int size, unsigned long count) { WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count); } EXPORT_SYMBOL(__copy_overflow);
1 1 5 5 4 1 3 3 2 3 3 2 3 3 3 3 6 2 4 2 2 1 1 1 1 1 1 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008, Intel Corporation. 
* * Author: Alexander Duyck <alexander.h.duyck@intel.com> */ #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> struct multiq_sched_data { u16 bands; u16 max_bands; u16 curband; struct tcf_proto __rcu *filter_list; struct tcf_block *block; struct Qdisc **queues; }; static struct Qdisc * multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) { struct multiq_sched_data *q = qdisc_priv(sch); u32 band; struct tcf_result res; struct tcf_proto *fl = rcu_dereference_bh(q->filter_list); int err; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; err = tcf_classify(skb, NULL, fl, &res, false); #ifdef CONFIG_NET_CLS_ACT switch (err) { case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; fallthrough; case TC_ACT_SHOT: return NULL; } #endif band = skb_get_queue_mapping(skb); if (band >= q->bands) return q->queues[0]; return q->queues[band]; } static int multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct Qdisc *qdisc; int ret; qdisc = multiq_classify(skb, sch, &ret); #ifdef CONFIG_NET_CLS_ACT if (qdisc == NULL) { if (ret & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); __qdisc_drop(skb, to_free); return ret; } #endif ret = qdisc_enqueue(skb, qdisc, to_free); if (ret == NET_XMIT_SUCCESS) { sch->q.qlen++; return NET_XMIT_SUCCESS; } if (net_xmit_drop_count(ret)) qdisc_qstats_drop(sch); return ret; } static struct sk_buff *multiq_dequeue(struct Qdisc *sch) { struct multiq_sched_data *q = qdisc_priv(sch); struct Qdisc *qdisc; struct sk_buff *skb; int band; for (band = 0; band < q->bands; band++) { /* cycle through bands to ensure fairness */ q->curband++; if (q->curband >= q->bands) q->curband = 0; /* Check that target subqueue is available before * pulling an skb to avoid head-of-line 
blocking. */ if (!netif_xmit_stopped( netdev_get_tx_queue(qdisc_dev(sch), q->curband))) { qdisc = q->queues[q->curband]; skb = qdisc->dequeue(qdisc); if (skb) { qdisc_bstats_update(sch, skb); sch->q.qlen--; return skb; } } } return NULL; } static struct sk_buff *multiq_peek(struct Qdisc *sch) { struct multiq_sched_data *q = qdisc_priv(sch); unsigned int curband = q->curband; struct Qdisc *qdisc; struct sk_buff *skb; int band; for (band = 0; band < q->bands; band++) { /* cycle through bands to ensure fairness */ curband++; if (curband >= q->bands) curband = 0; /* Check that target subqueue is available before * pulling an skb to avoid head-of-line blocking. */ if (!netif_xmit_stopped( netdev_get_tx_queue(qdisc_dev(sch), curband))) { qdisc = q->queues[curband]; skb = qdisc->ops->peek(qdisc); if (skb) return skb; } } return NULL; } static void multiq_reset(struct Qdisc *sch) { u16 band; struct multiq_sched_data *q = qdisc_priv(sch); for (band = 0; band < q->bands; band++) qdisc_reset(q->queues[band]); q->curband = 0; } static void multiq_destroy(struct Qdisc *sch) { int band; struct multiq_sched_data *q = qdisc_priv(sch); tcf_block_put(q->block); for (band = 0; band < q->bands; band++) qdisc_put(q->queues[band]); kfree(q->queues); } static int multiq_tune(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct multiq_sched_data *q = qdisc_priv(sch); struct tc_multiq_qopt *qopt; struct Qdisc **removed; int i, n_removed = 0; if (!netif_is_multiqueue(qdisc_dev(sch))) return -EOPNOTSUPP; if (nla_len(opt) < sizeof(*qopt)) return -EINVAL; qopt = nla_data(opt); qopt->bands = qdisc_dev(sch)->real_num_tx_queues; removed = kmalloc(sizeof(*removed) * (q->max_bands - q->bands), GFP_KERNEL); if (!removed) return -ENOMEM; sch_tree_lock(sch); q->bands = qopt->bands; for (i = q->bands; i < q->max_bands; i++) { if (q->queues[i] != &noop_qdisc) { struct Qdisc *child = q->queues[i]; q->queues[i] = &noop_qdisc; qdisc_purge_queue(child); removed[n_removed++] = 
child; } } sch_tree_unlock(sch); for (i = 0; i < n_removed; i++) qdisc_put(removed[i]); kfree(removed); for (i = 0; i < q->bands; i++) { if (q->queues[i] == &noop_qdisc) { struct Qdisc *child, *old; child = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, TC_H_MAKE(sch->handle, i + 1), extack); if (child) { sch_tree_lock(sch); old = q->queues[i]; q->queues[i] = child; if (child != &noop_qdisc) qdisc_hash_add(child, true); if (old != &noop_qdisc) qdisc_purge_queue(old); sch_tree_unlock(sch); qdisc_put(old); } } } return 0; } static int multiq_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct multiq_sched_data *q = qdisc_priv(sch); int i, err; q->queues = NULL; if (!opt) return -EINVAL; err = tcf_block_get(&q->block, &q->filter_list, sch, extack); if (err) return err; q->max_bands = qdisc_dev(sch)->num_tx_queues; q->queues = kcalloc(q->max_bands, sizeof(struct Qdisc *), GFP_KERNEL); if (!q->queues) return -ENOBUFS; for (i = 0; i < q->max_bands; i++) q->queues[i] = &noop_qdisc; return multiq_tune(sch, opt, extack); } static int multiq_dump(struct Qdisc *sch, struct sk_buff *skb) { struct multiq_sched_data *q = qdisc_priv(sch); unsigned char *b = skb_tail_pointer(skb); struct tc_multiq_qopt opt; opt.bands = q->bands; opt.max_bands = q->max_bands; if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) goto nla_put_failure; return skb->len; nla_put_failure: nlmsg_trim(skb, b); return -1; } static int multiq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { struct multiq_sched_data *q = qdisc_priv(sch); unsigned long band = arg - 1; if (new == NULL) new = &noop_qdisc; *old = qdisc_replace(sch, new, &q->queues[band]); return 0; } static struct Qdisc * multiq_leaf(struct Qdisc *sch, unsigned long arg) { struct multiq_sched_data *q = qdisc_priv(sch); unsigned long band = arg - 1; return q->queues[band]; } static unsigned long multiq_find(struct Qdisc *sch, u32 classid) { 
struct multiq_sched_data *q = qdisc_priv(sch); unsigned long band = TC_H_MIN(classid); if (band - 1 >= q->bands) return 0; return band; } static unsigned long multiq_bind(struct Qdisc *sch, unsigned long parent, u32 classid) { return multiq_find(sch, classid); } static void multiq_unbind(struct Qdisc *q, unsigned long cl) { } static int multiq_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { struct multiq_sched_data *q = qdisc_priv(sch); tcm->tcm_handle |= TC_H_MIN(cl); tcm->tcm_info = q->queues[cl - 1]->handle; return 0; } static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct gnet_dump *d) { struct multiq_sched_data *q = qdisc_priv(sch); struct Qdisc *cl_q; cl_q = q->queues[cl - 1]; if (gnet_stats_copy_basic(d, cl_q->cpu_bstats, &cl_q->bstats, true) < 0 || qdisc_qstats_copy(d, cl_q) < 0) return -1; return 0; } static void multiq_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct multiq_sched_data *q = qdisc_priv(sch); int band; if (arg->stop) return; for (band = 0; band < q->bands; band++) { if (!tc_qdisc_stats_dump(sch, band + 1, arg)) break; } } static struct tcf_block *multiq_tcf_block(struct Qdisc *sch, unsigned long cl, struct netlink_ext_ack *extack) { struct multiq_sched_data *q = qdisc_priv(sch); if (cl) return NULL; return q->block; } static const struct Qdisc_class_ops multiq_class_ops = { .graft = multiq_graft, .leaf = multiq_leaf, .find = multiq_find, .walk = multiq_walk, .tcf_block = multiq_tcf_block, .bind_tcf = multiq_bind, .unbind_tcf = multiq_unbind, .dump = multiq_dump_class, .dump_stats = multiq_dump_class_stats, }; static struct Qdisc_ops multiq_qdisc_ops __read_mostly = { .next = NULL, .cl_ops = &multiq_class_ops, .id = "multiq", .priv_size = sizeof(struct multiq_sched_data), .enqueue = multiq_enqueue, .dequeue = multiq_dequeue, .peek = multiq_peek, .init = multiq_init, .reset = multiq_reset, .destroy = multiq_destroy, .change = multiq_tune, .dump = multiq_dump, .owner = 
THIS_MODULE, }; MODULE_ALIAS_NET_SCH("multiq"); static int __init multiq_module_init(void) { return register_qdisc(&multiq_qdisc_ops); } static void __exit multiq_module_exit(void) { unregister_qdisc(&multiq_qdisc_ops); } module_init(multiq_module_init) module_exit(multiq_module_exit) MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Multi queue to hardware queue mapping qdisc");
1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 
524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 // SPDX-License-Identifier: GPL-2.0-or-later #include <linux/plist.h> #include <linux/sched/task.h> #include <linux/sched/signal.h> #include <linux/freezer.h> #include "futex.h" /* * READ this before attempting to hack on futexes! * * Basic futex operation and ordering guarantees * ============================================= * * The waiter reads the futex value in user space and calls * futex_wait(). This function computes the hash bucket and acquires * the hash bucket lock. After that it reads the futex user space value * again and verifies that the data has not changed. If it has not changed * it enqueues itself into the hash bucket, releases the hash bucket lock * and schedules. * * The waker side modifies the user space value of the futex and calls * futex_wake(). This function computes the hash bucket and acquires the * hash bucket lock. Then it looks for waiters on that futex in the hash * bucket and wakes them. * * In futex wake up scenarios where no tasks are blocked on a futex, taking * the hb spinlock can be avoided and simply return. 
In order for this * optimization to work, ordering guarantees must exist so that the waiter * being added to the list is acknowledged when the list is concurrently being * checked by the waker, avoiding scenarios like the following: * * CPU 0 CPU 1 * val = *futex; * sys_futex(WAIT, futex, val); * futex_wait(futex, val); * uval = *futex; * *futex = newval; * sys_futex(WAKE, futex); * futex_wake(futex); * if (queue_empty()) * return; * if (uval == val) * lock(hash_bucket(futex)); * queue(); * unlock(hash_bucket(futex)); * schedule(); * * This would cause the waiter on CPU 0 to wait forever because it * missed the transition of the user space value from val to newval * and the waker did not find the waiter in the hash bucket queue. * * The correct serialization ensures that a waiter either observes * the changed user space value before blocking or is woken by a * concurrent waker: * * CPU 0 CPU 1 * val = *futex; * sys_futex(WAIT, futex, val); * futex_wait(futex, val); * * waiters++; (a) * smp_mb(); (A) <-- paired with -. * | * lock(hash_bucket(futex)); | * | * uval = *futex; | * | *futex = newval; * | sys_futex(WAKE, futex); * | futex_wake(futex); * | * `--------> smp_mb(); (B) * if (uval == val) * queue(); * unlock(hash_bucket(futex)); * schedule(); if (waiters) * lock(hash_bucket(futex)); * else wake_waiters(futex); * waiters--; (b) unlock(hash_bucket(futex)); * * Where (A) orders the waiters increment and the futex value read through * atomic operations (see futex_hb_waiters_inc) and where (B) orders the write * to futex and the waiters read (see futex_hb_waiters_pending()). * * This yields the following case (where X:=waiters, Y:=futex): * * X = Y = 0 * * w[X]=1 w[Y]=1 * MB MB * r[Y]=y r[X]=x * * Which guarantees that x==0 && y==0 is impossible; which translates back into * the guarantee that we cannot both miss the futex variable change and the * enqueue. 
* * Note that a new waiter is accounted for in (a) even when it is possible that * the wait call can return error, in which case we backtrack from it in (b). * Refer to the comment in futex_q_lock(). * * Similarly, in order to account for waiters being requeued on another * address we always increment the waiters for the destination bucket before * acquiring the lock. It then decrements them again after releasing it - * the code that actually moves the futex(es) between hash buckets (requeue_futex) * will do the additional required waiter count housekeeping. This is done for * double_lock_hb() and double_unlock_hb(), respectively. */ bool __futex_wake_mark(struct futex_q *q) { if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n")) return false; __futex_unqueue(q); /* * The waiting task can free the futex_q as soon as q->lock_ptr = NULL * is written, without taking any locks. This is possible in the event * of a spurious wakeup, for example. A memory barrier is required here * to prevent the following store to lock_ptr from getting ahead of the * plist_del in __futex_unqueue(). */ smp_store_release(&q->lock_ptr, NULL); return true; } /* * The hash bucket lock must be held when this is called. * Afterwards, the futex_q must not be accessed. Callers * must ensure to later call wake_up_q() for the actual * wakeups to occur. */ void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q) { struct task_struct *p = q->task; get_task_struct(p); if (!__futex_wake_mark(q)) { put_task_struct(p); return; } /* * Queue the task for later wakeup for after we've released * the hb->lock. */ wake_q_add_safe(wake_q, p); } /* * Wake up waiters matching bitset queued on this futex (uaddr). 
*/ int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) { struct futex_hash_bucket *hb; struct futex_q *this, *next; union futex_key key = FUTEX_KEY_INIT; DEFINE_WAKE_Q(wake_q); int ret; if (!bitset) return -EINVAL; ret = get_futex_key(uaddr, flags, &key, FUTEX_READ); if (unlikely(ret != 0)) return ret; if ((flags & FLAGS_STRICT) && !nr_wake) return 0; hb = futex_hash(&key); /* Make sure we really have tasks to wakeup */ if (!futex_hb_waiters_pending(hb)) return ret; spin_lock(&hb->lock); plist_for_each_entry_safe(this, next, &hb->chain, list) { if (futex_match (&this->key, &key)) { if (this->pi_state || this->rt_waiter) { ret = -EINVAL; break; } /* Check if one of the bits is set in both bitsets */ if (!(this->bitset & bitset)) continue; this->wake(&wake_q, this); if (++ret >= nr_wake) break; } } spin_unlock(&hb->lock); wake_up_q(&wake_q); return ret; } static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) { unsigned int op = (encoded_op & 0x70000000) >> 28; unsigned int cmp = (encoded_op & 0x0f000000) >> 24; int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11); int cmparg = sign_extend32(encoded_op & 0x00000fff, 11); int oldval, ret; if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) { if (oparg < 0 || oparg > 31) { char comm[sizeof(current->comm)]; /* * kill this print and return -EINVAL when userspace * is sane again */ pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n", get_task_comm(comm, current), oparg); oparg &= 31; } oparg = 1 << oparg; } pagefault_disable(); ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr); pagefault_enable(); if (ret) return ret; switch (cmp) { case FUTEX_OP_CMP_EQ: return oldval == cmparg; case FUTEX_OP_CMP_NE: return oldval != cmparg; case FUTEX_OP_CMP_LT: return oldval < cmparg; case FUTEX_OP_CMP_GE: return oldval >= cmparg; case FUTEX_OP_CMP_LE: return oldval <= cmparg; case FUTEX_OP_CMP_GT: return oldval > cmparg; default: return 
-ENOSYS; } } /* * Wake up all waiters hashed on the physical page that is mapped * to this virtual address: */ int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, int nr_wake, int nr_wake2, int op) { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; struct futex_hash_bucket *hb1, *hb2; struct futex_q *this, *next; int ret, op_ret; DEFINE_WAKE_Q(wake_q); retry: ret = get_futex_key(uaddr1, flags, &key1, FUTEX_READ); if (unlikely(ret != 0)) return ret; ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE); if (unlikely(ret != 0)) return ret; hb1 = futex_hash(&key1); hb2 = futex_hash(&key2); retry_private: double_lock_hb(hb1, hb2); op_ret = futex_atomic_op_inuser(op, uaddr2); if (unlikely(op_ret < 0)) { double_unlock_hb(hb1, hb2); if (!IS_ENABLED(CONFIG_MMU) || unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) { /* * we don't get EFAULT from MMU faults if we don't have * an MMU, but we might get them from range checking */ ret = op_ret; return ret; } if (op_ret == -EFAULT) { ret = fault_in_user_writeable(uaddr2); if (ret) return ret; } cond_resched(); if (!(flags & FLAGS_SHARED)) goto retry_private; goto retry; } plist_for_each_entry_safe(this, next, &hb1->chain, list) { if (futex_match (&this->key, &key1)) { if (this->pi_state || this->rt_waiter) { ret = -EINVAL; goto out_unlock; } this->wake(&wake_q, this); if (++ret >= nr_wake) break; } } if (op_ret > 0) { op_ret = 0; plist_for_each_entry_safe(this, next, &hb2->chain, list) { if (futex_match (&this->key, &key2)) { if (this->pi_state || this->rt_waiter) { ret = -EINVAL; goto out_unlock; } this->wake(&wake_q, this); if (++op_ret >= nr_wake2) break; } } ret += op_ret; } out_unlock: double_unlock_hb(hb1, hb2); wake_up_q(&wake_q); return ret; } static long futex_wait_restart(struct restart_block *restart); /** * futex_wait_queue() - futex_queue() and wait for wakeup, timeout, or signal * @hb: the futex hash bucket, must be locked by the caller * @q: the futex_q to queue up on * 
@timeout: the prepared hrtimer_sleeper, or null for no timeout
 */
void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
		      struct hrtimer_sleeper *timeout)
{
	/*
	 * The task state is guaranteed to be set before another task can
	 * wake it. set_current_state() is implemented using smp_store_mb() and
	 * futex_queue() calls spin_unlock() upon completion, both serializing
	 * access to the hash list and forcing another memory barrier.
	 */
	set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
	futex_queue(q, hb);

	/* Arm the timer */
	if (timeout)
		hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS);

	/*
	 * If we have been removed from the hash list, then another task
	 * has tried to wake us, and we can skip the call to schedule().
	 */
	if (likely(!plist_node_empty(&q->list))) {
		/*
		 * If the timer has already expired, current will already be
		 * flagged for rescheduling. Only call schedule if there
		 * is no timeout, or if it has yet to expire.
		 */
		if (!timeout || timeout->task)
			schedule();
	}
	/*
	 * Every path ends in TASK_RUNNING. Nothing is returned: the caller
	 * has to work out afterwards why the wait ended.
	 */
	__set_current_state(TASK_RUNNING);
}

/**
 * futex_unqueue_multiple - Remove various futexes from their hash bucket
 * @v:	   The list of futexes to unqueue
 * @count: Number of futexes in the list
 *
 * Helper to unqueue a list of futexes. This can't fail.
 *
 * Every entry in [0, @count) is passed to futex_unqueue(); the walk does not
 * stop at the first hit, so the highest matching index is what is reported.
 *
 * Return:
 *  - >=0 - Index of the last futex that was awoken;
 *  - -1 - No futex was awoken
 */
int futex_unqueue_multiple(struct futex_vector *v, int count)
{
	int ret = -1, i;

	for (i = 0; i < count; i++) {
		/* A zero return from futex_unqueue() is counted as "was awoken". */
		if (!futex_unqueue(&v[i].q))
			ret = i;
	}

	return ret;
}

/**
 * futex_wait_multiple_setup - Prepare to wait and enqueue multiple futexes
 * @vs: The futex list to wait on
 * @count: The size of the list
 * @woken: Index of the last woken futex, if any. Used to notify the
 * caller that it can return this index to userspace (return parameter)
 *
 * Prepare multiple futexes in a single step and enqueue them. This may fail if
 * the futex list is invalid or if any futex was already awoken. On success the
 * task is ready to interruptible sleep.
*
 * Return:
 * - 1 - One of the futexes was woken by another thread
 * - 0 - Success
 * - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL
 */
int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken)
{
	struct futex_hash_bucket *hb;
	bool retry = false;
	int ret, i;
	u32 uval;

	/*
	 * Enqueuing multiple futexes is tricky, because we need to enqueue
	 * each futex on the list before dealing with the next one to avoid
	 * deadlocking on the hash bucket. But, before enqueuing, we need to
	 * make sure that current->state is TASK_INTERRUPTIBLE, so we don't
	 * lose any wake events, which cannot be done before the get_futex_key
	 * of the next key, because it calls get_user_pages, which can sleep.
	 * Thus, we fetch the list of futexes keys in two steps, by first
	 * pinning all the memory keys in the futex key, and only then we read
	 * each key and queue the corresponding futex.
	 *
	 * Private futexes don't need to recalculate the hash on retry, so
	 * get_futex_key() is skipped for them when retrying.
	 */
retry:
	/* Step 1: resolve every key while sleeping is still allowed. */
	for (i = 0; i < count; i++) {
		if (!(vs[i].w.flags & FLAGS_SHARED) && retry)
			continue;

		ret = get_futex_key(u64_to_user_ptr(vs[i].w.uaddr),
				    vs[i].w.flags,
				    &vs[i].q.key, FUTEX_READ);

		if (unlikely(ret))
			return ret;
	}

	set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);

	/* Step 2: check each user value under its bucket lock and queue it. */
	for (i = 0; i < count; i++) {
		u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr;
		struct futex_q *q = &vs[i].q;
		u32 val = vs[i].w.val;

		hb = futex_q_lock(q);
		ret = futex_get_value_locked(&uval, uaddr);

		if (!ret && uval == val) {
			/*
			 * The bucket lock can't be held while dealing with the
			 * next futex. Queue each futex at this moment so hb can
			 * be unlocked.
			 */
			futex_queue(q, hb);
			continue;
		}

		/* Either the read failed or the value differs: back out. */
		futex_q_unlock(hb);
		__set_current_state(TASK_RUNNING);

		/*
		 * Even if something went wrong, if we find out that a futex
		 * was woken, we don't return error and return this index to
		 * userspace
		 */
		/* Only the i entries queued so far are unqueued. */
		*woken = futex_unqueue_multiple(vs, i);
		if (*woken >= 0)
			return 1;

		if (ret) {
			/*
			 * If we need to handle a page fault, we need to do so
			 * without any lock and any enqueued futex (otherwise
			 * we could lose some wakeup). So we do it here, after
			 * undoing all the work done so far. In success, we
			 * retry all the work.
			 */
			if (get_user(uval, uaddr))
				return -EFAULT;

			retry = true;
			goto retry;
		}

		/*
		 * ret is 0 here, so the "queue and continue" branch above was
		 * skipped because the value differed; this test always holds.
		 */
		if (uval != val)
			return -EWOULDBLOCK;
	}

	return 0;
}

/**
 * futex_sleep_multiple - Check sleeping conditions and sleep
 * @vs:    List of futexes to wait for
 * @count: Length of vs
 * @to:    Timeout
 *
 * Sleep if and only if the timeout hasn't expired and no futex on the list has
 * been woken up.
 */
static void futex_sleep_multiple(struct futex_vector *vs, unsigned int count,
				 struct hrtimer_sleeper *to)
{
	/* A timeout whose ->task is NULL is treated as already expired. */
	if (to && !to->task)
		return;

	/* A NULL lock_ptr on any entry is treated as "already woken". */
	for (; count; count--, vs++) {
		if (!READ_ONCE(vs->q.lock_ptr))
			return;
	}

	schedule();
}

/**
 * futex_wait_multiple - Prepare to wait on and enqueue several futexes
 * @vs: The list of futexes to wait on
 * @count: The number of objects
 * @to: Timeout before giving up and returning to userspace
 *
 * Entry point for the FUTEX_WAIT_MULTIPLE futex operation, this function
 * sleeps on a group of futexes and returns on the first futex that is
 * wake, or after the timeout has elapsed.
*
 * Return:
 * - >=0 - Hint to the futex that was awoken
 * - <0 - On error
 */
int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
			struct hrtimer_sleeper *to)
{
	int ret, hint = 0;

	/* The timer is started once, before the retry loop. */
	if (to)
		hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);

	while (1) {
		ret = futex_wait_multiple_setup(vs, count, &hint);
		if (ret) {
			if (ret > 0) {
				/* A futex was woken during setup */
				ret = hint;
			}
			return ret;
		}

		futex_sleep_multiple(vs, count, to);

		__set_current_state(TASK_RUNNING);

		/* A woken futex wins over timeout and signal checks. */
		ret = futex_unqueue_multiple(vs, count);
		if (ret >= 0)
			return ret;

		if (to && !to->task)
			return -ETIMEDOUT;
		else if (signal_pending(current))
			return -ERESTARTSYS;
		/*
		 * The final case is a spurious wakeup, for
		 * which just retry.
		 */
	}
}

/**
 * futex_wait_setup() - Prepare to wait on a futex
 * @uaddr:	the futex userspace address
 * @val:	the expected value
 * @flags:	futex flags (FLAGS_SHARED, etc.)
 * @q:		the associated futex_q
 * @hb:		storage for hash_bucket pointer to be returned to caller
 *
 * Setup the futex_q and locate the hash_bucket.  Get the futex value and
 * compare it with the expected value.  Handle atomic faults internally.
 * Return with the hb lock held on success, and unlocked on failure.
 *
 * Return:
 *  -  0 - uaddr contains val and hb has been locked;
 *  - <0 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
 */
int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
		     struct futex_q *q, struct futex_hash_bucket **hb)
{
	u32 uval;
	int ret;

	/*
	 * Access the page AFTER the hash-bucket is locked.
	 * Order is important:
	 *
	 *   Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
	 *   Userspace waker:  if (cond(var)) { var = new; futex_wake(&var); }
	 *
	 * The basic logical guarantee of a futex is that it blocks ONLY
	 * if cond(var) is known to be true at the time of blocking, for
	 * any cond.  If we locked the hash-bucket after testing *uaddr, that
	 * would open a race condition where we could block indefinitely with
	 * cond(var) false, which would violate the guarantee.
	 *
	 * On the other hand, we insert q and release the hash-bucket only
	 * after testing *uaddr.  This guarantees that futex_wait() will NOT
	 * absorb a wakeup if *uaddr does not match the desired values
	 * while the syscall executes.
	 */
retry:
	ret = get_futex_key(uaddr, flags, &q->key, FUTEX_READ);
	if (unlikely(ret != 0))
		return ret;

retry_private:
	*hb = futex_q_lock(q);

	ret = futex_get_value_locked(&uval, uaddr);

	if (ret) {
		/*
		 * The locked read failed: drop the lock, read the word with
		 * get_user() and start over. The value read here is not used
		 * for the comparison; the retry reads it again under the lock.
		 */
		futex_q_unlock(*hb);

		ret = get_user(uval, uaddr);
		if (ret)
			return ret;

		/* Without FLAGS_SHARED the key computed above is reused. */
		if (!(flags & FLAGS_SHARED))
			goto retry_private;

		goto retry;
	}

	if (uval != val) {
		futex_q_unlock(*hb);
		ret = -EWOULDBLOCK;
	}

	return ret;
}

/*
 * __futex_wait() - wait on @uaddr as long as it still contains @val
 *
 * @bitset is stored in the on-stack futex_q before queueing. @to may be NULL
 * for no timeout; this function neither arms it itself (that happens inside
 * futex_wait_queue()) nor cancels it.
 *
 * Return: 0 when the futex_q had already been unqueued by someone else after
 * the wait, -EINVAL for an empty @bitset, the error from futex_wait_setup(),
 * -ETIMEDOUT when @to is set and to->task is NULL, or -ERESTARTSYS when a
 * signal is pending. Any other wakeup restarts the wait.
 */
int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
		 struct hrtimer_sleeper *to, u32 bitset)
{
	struct futex_q q = futex_q_init;
	struct futex_hash_bucket *hb;
	int ret;

	if (!bitset)
		return -EINVAL;

	q.bitset = bitset;

retry:
	/*
	 * Prepare to wait on uaddr. On success, it holds hb->lock and q
	 * is initialized.
	 */
	ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
	if (ret)
		return ret;

	/* futex_queue and wait for wakeup, timeout, or a signal. */
	futex_wait_queue(hb, &q, to);

	/* If we were woken (and unqueued), we succeeded, whatever. */
	if (!futex_unqueue(&q))
		return 0;

	if (to && !to->task)
		return -ETIMEDOUT;

	/*
	 * We expect signal_pending(current), but we might be the
	 * victim of a spurious wakeup as well.
	 */
	if (!signal_pending(current))
		goto retry;

	return -ERESTARTSYS;
}

/*
 * futex_wait() - __futex_wait() plus timer setup, teardown and restart data
 *
 * The sleeper is obtained from futex_setup_timer() using @abs_time and the
 * task's timer slack. If that yields no sleeper, the result of __futex_wait()
 * is returned unchanged, including -ERESTARTSYS. With a sleeper, the timer is
 * cancelled and destroyed, and an -ERESTARTSYS result is turned into a restart
 * block that re-runs the wait through futex_wait_restart().
 */
int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset)
{
	struct hrtimer_sleeper timeout, *to;
	struct restart_block *restart;
	int ret;

	to = futex_setup_timer(abs_time, &timeout, flags,
			       current->timer_slack_ns);

	ret = __futex_wait(uaddr, flags, val, to, bitset);

	/* No timeout, nothing to clean up. */
	if (!to)
		return ret;

	hrtimer_cancel(&to->timer);
	destroy_hrtimer_on_stack(&to->timer);

	if (ret == -ERESTARTSYS) {
		/* Save every argument so the restart can repeat this call. */
		restart = &current->restart_block;
		restart->futex.uaddr = uaddr;
		restart->futex.val = val;
		/* NOTE(review): relies on abs_time being non-NULL whenever to is. */
		restart->futex.time = *abs_time;
		restart->futex.bitset = bitset;
		restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;

		return set_restart_fn(restart, futex_wait_restart);
	}

	return ret;
}

/*
 * Restart handler installed by futex_wait(): repeats the wait with the
 * arguments saved in @restart. The saved time is passed on only when
 * FLAGS_HAS_TIMEOUT is set in the saved flags; otherwise no timeout is used.
 */
static long futex_wait_restart(struct restart_block *restart)
{
	u32 __user *uaddr = restart->futex.uaddr;
	ktime_t t, *tp = NULL;

	if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
		t = restart->futex.time;
		tp = &t;
	}
	/* Reset the handler first; futex_wait() installs it again if needed. */
	restart->fn = do_no_restart_syscall;

	return (long)futex_wait(uaddr, restart->futex.flags,
				restart->futex.val, tp, restart->futex.bitset);
}
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:ip,port type */ #include <linux/jhash.h> #include <linux/module.h> #include <linux/ip.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/random.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/netlink.h> #include <net/tcp.h> #include 
<linux/netfilter.h>
#include <linux/netfilter/ipset/pfxlen.h>
#include <linux/netfilter/ipset/ip_set.h>
#include <linux/netfilter/ipset/ip_set_getport.h>
#include <linux/netfilter/ipset/ip_set_hash.h>

/* Revision range of this set type; each step is annotated with its feature. */
#define IPSET_TYPE_REV_MIN 0
/* 1 SCTP and UDPLITE support added */
/* 2 Counters support added */
/* 3 Comments support added */
/* 4 Forceadd support added */
/* 5 skbinfo support added */
/* 6 bucketsize, initval support added */
#define IPSET_TYPE_REV_MAX 7 /* bitmask support added */

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
IP_SET_MODULE_DESC("hash:ip,port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_hash:ip,port");

/* Type specific function prefix */
#define HTYPE hash_ipport
#define IP_SET_HASH_WITH_NETMASK
#define IP_SET_HASH_WITH_BITMASK

/* IPv4 variant */

/* Member elements */
struct hash_ipport4_elem {
	__be32 ip;	/* address, network byte order */
	__be16 port;	/* port, network byte order */
	u8 proto;
	u8 padding;	/* not read by any function in this file */
};

/* Common functions */

/* Two elements are equal when ip, port and proto all match; @multi is unused. */
static bool
hash_ipport4_data_equal(const struct hash_ipport4_elem *ip1,
			const struct hash_ipport4_elem *ip2,
			u32 *multi)
{
	return ip1->ip == ip2->ip &&
	       ip1->port == ip2->port &&
	       ip1->proto == ip2->proto;
}

/*
 * Append the element to @skb as IP, PORT and PROTO attributes.
 * Note the inverted convention: returns true on failure, false on success.
 */
static bool
hash_ipport4_data_list(struct sk_buff *skb,
		       const struct hash_ipport4_elem *data)
{
	if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) ||
	    nla_put_net16(skb, IPSET_ATTR_PORT, data->port) ||
	    nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto))
		goto nla_put_failure;
	return false;

nla_put_failure:
	return true;
}

/* Record ip and port of @d in @next; proto is not copied. */
static void
hash_ipport4_data_next(struct hash_ipport4_elem *next,
		       const struct hash_ipport4_elem *d)
{
	next->ip = d->ip;
	next->port = d->port;
}

/*
 * Instantiate the generic hash code for the IPv4 element type.
 * The names picked up here (struct hash_ipport4, set->variant, ...) are
 * presumably generated from MTYPE/HTYPE by the header - verify there.
 */
#define MTYPE hash_ipport4
#define HOST_MASK 32
#include "ip_set_hash_gen.h"

/*
 * Kernel-side add/del/test: build the element from packet @skb.
 *
 * IPSET_DIM_ONE_SRC and IPSET_DIM_TWO_SRC in opt->flags are passed to the
 * address and port helpers respectively. The address is ANDed with the set's
 * bitmask, and an all-zero result is rejected. @par is unused.
 *
 * Return: -EINVAL if port/proto cannot be extracted or the masked address is
 * zero, otherwise the result of the variant's adt function.
 */
static int
hash_ipport4_kadt(struct ip_set *set, const struct sk_buff *skb,
		  const struct xt_action_param *par,
		  enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
	ipset_adtfn adtfn = set->variant->adt[adt];
	struct hash_ipport4_elem e = { .ip = 0 };
	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
	const struct MTYPE *h = set->data;

	if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
				 &e.port, &e.proto))
		return -EINVAL;

	ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip);
	e.ip &= h->bitmask.ip;
	if (e.ip == 0)
		return -EINVAL;
	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
}

/*
 * Userspace-side add/del/test driven by netlink attributes @tb.
 *
 * A single element is handled directly. With IP_TO, CIDR or PORT_TO (and
 * @adt other than IPSET_TEST) the function iterates over the address range
 * and, for each address, the port range. After more than IPSET_MAX_RANGE
 * iterations it stores the current position in h->next and returns -ERANGE;
 * a call with @retried set resumes from that stored position.
 *
 * Return: 0 on success, a negative errno or -IPSET_ERR_* code otherwise.
 */
static int
hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
		  enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
	struct hash_ipport4 *h = set->data;
	ipset_adtfn adtfn = set->variant->adt[adt];
	struct hash_ipport4_elem e = { .ip = 0 };
	struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
	u32 ip, ip_to = 0, p = 0, port, port_to, i = 0;
	bool with_ports = false;
	int ret;

	if (tb[IPSET_ATTR_LINENO])
		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);

	/* IP and PORT are mandatory; PORT_TO is optional but checked if given. */
	if (unlikely(!tb[IPSET_ATTR_IP] ||
		     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO)))
		return -IPSET_ERR_PROTOCOL;

	ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip);
	if (ret)
		return ret;

	ret = ip_set_get_extensions(set, tb, &ext);
	if (ret)
		return ret;

	/* Same masking and zero-address rejection as the kernel-side path. */
	e.ip &= h->bitmask.ip;
	if (e.ip == 0)
		return -EINVAL;

	e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);

	/* PROTO is required and must be non-zero. */
	if (tb[IPSET_ATTR_PROTO]) {
		e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
		with_ports = ip_set_proto_with_ports(e.proto);

		if (e.proto == 0)
			return -IPSET_ERR_INVALID_PROTO;
	} else {
		return -IPSET_ERR_MISSING_PROTO;
	}

	/* The port value is kept only for port-carrying protocols and ICMP. */
	if (!(with_ports || e.proto == IPPROTO_ICMP))
		e.port = 0;

	/* Single-element case: a test, or no range attribute at all. */
	if (adt == IPSET_TEST ||
	    !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] ||
	      tb[IPSET_ATTR_PORT_TO])) {
		ret = adtfn(set, &e, &ext, &ext, flags);
		return ip_set_eexist(ret, flags) ? 0 : ret;
	}

	/* Address range in host byte order: explicit end, CIDR block, or one. */
	ip_to = ip = ntohl(e.ip);
	if (tb[IPSET_ATTR_IP_TO]) {
		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
		if (ret)
			return ret;
		if (ip > ip_to)
			swap(ip, ip_to);
	} else if (tb[IPSET_ATTR_CIDR]) {
		u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);

		if (!cidr || cidr > HOST_MASK)
			return -IPSET_ERR_INVALID_CIDR;
		ip_set_mask_from_to(ip, ip_to, cidr);
	}

	/* Port range in host byte order; PORT_TO is ignored without ports. */
	port_to = port = ntohs(e.port);
	if (with_ports && tb[IPSET_ATTR_PORT_TO]) {
		port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
		if (port > port_to)
			swap(port, port_to);
	}

	/* On retry, resume at the address saved in h->next. */
	if (retried)
		ip = ntohl(h->next.ip);
	for (; ip <= ip_to; ip++) {
		/* The saved port applies only to the address we stopped at. */
		p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port)
						       : port;
		/* i counts iterations across all addresses, not per address. */
		for (; p <= port_to; p++, i++) {
			e.ip = htonl(ip);
			e.port = htons(p);
			if (i > IPSET_MAX_RANGE) {
				hash_ipport4_data_next(&h->next, &e);
				return -ERANGE;
			}
			ret = adtfn(set, &e, &ext, &ext, flags);

			/* Errors accepted by ip_set_eexist() do not abort. */
			if (ret && !ip_set_eexist(ret, flags))
				return ret;

			ret = 0;
		}
	}
	return ret;
}

/* IPv6 variant */

struct hash_ipport6_elem {
	union nf_inet_addr ip;
	__be16 port;	/* port, network byte order */
	u8 proto;
	u8 padding;	/* not read by any function in this file */
};

/* Common functions */

/* Two elements are equal when ip, port and proto all match; @multi is unused. */
static bool
hash_ipport6_data_equal(const struct hash_ipport6_elem *ip1,
			const struct hash_ipport6_elem *ip2,
			u32 *multi)
{
	return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) &&
	       ip1->port == ip2->port &&
	       ip1->proto == ip2->proto;
}

/*
 * Append the element to @skb as IP, PORT and PROTO attributes.
 * Returns true on failure, false on success.
 */
static bool
hash_ipport6_data_list(struct sk_buff *skb,
		       const struct hash_ipport6_elem *data)
{
	if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) ||
	    nla_put_net16(skb, IPSET_ATTR_PORT, data->port) ||
	    nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto))
		goto nla_put_failure;
	return false;

nla_put_failure:
	return true;
}

/* Only the port is recorded: the IPv6 path below iterates ports, not addresses. */
static void
hash_ipport6_data_next(struct hash_ipport6_elem *next,
		       const struct hash_ipport6_elem *d)
{
	next->port = d->port;
}

/*
 * Second instantiation of the generic hash code, now for IPv6.
 * hash_ipport_create(), referenced by the type descriptor below, is
 * presumably emitted here because of IP_SET_EMIT_CREATE - verify in the header.
 */
#undef MTYPE
#undef HOST_MASK

#define MTYPE hash_ipport6
#define HOST_MASK 128
#define IP_SET_EMIT_CREATE
#include "ip_set_hash_gen.h"

/*
 * Kernel-side add/del/test for IPv6; mirrors hash_ipport4_kadt().
 * The address is masked in place with the set's bitmask and the unspecified
 * (all-zero) address is rejected with -EINVAL. @par is unused.
 */
static int
hash_ipport6_kadt(struct ip_set *set, const struct sk_buff *skb,
		  const struct xt_action_param *par,
		  enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
	ipset_adtfn adtfn = set->variant->adt[adt];
	struct hash_ipport6_elem e = { .ip = { .all = { 0 } } };
	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
	const struct MTYPE *h = set->data;

	if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
				 &e.port, &e.proto))
		return -EINVAL;

	ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
	nf_inet_addr_mask_inplace(&e.ip, &h->bitmask);
	if (ipv6_addr_any(&e.ip.in6))
		return -EINVAL;

	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
}

/*
 * Userspace-side add/del/test for IPv6.
 *
 * Address ranges are not supported: IP_TO is rejected and a CIDR attribute
 * must equal HOST_MASK. Only a port range (PORT_TO) is iterated.
 *
 * Return: 0 on success, a negative errno or -IPSET_ERR_* code otherwise.
 */
static int
hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[],
		  enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
	const struct hash_ipport6 *h = set->data;
	ipset_adtfn adtfn = set->variant->adt[adt];
	struct hash_ipport6_elem e = { .ip = { .all = { 0 } } };
	struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
	u32 port, port_to;
	bool with_ports = false;
	int ret;

	if (tb[IPSET_ATTR_LINENO])
		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);

	if (unlikely(!tb[IPSET_ATTR_IP] ||
		     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO)))
		return -IPSET_ERR_PROTOCOL;
	if (unlikely(tb[IPSET_ATTR_IP_TO]))
		return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
	if (unlikely(tb[IPSET_ATTR_CIDR])) {
		u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);

		if (cidr != HOST_MASK)
			return -IPSET_ERR_INVALID_CIDR;
	}

	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip);
	if (ret)
		return ret;

	ret = ip_set_get_extensions(set, tb, &ext);
	if (ret)
		return ret;

	nf_inet_addr_mask_inplace(&e.ip, &h->bitmask);
	if (ipv6_addr_any(&e.ip.in6))
		return -EINVAL;

	e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);

	/* PROTO is required and must be non-zero. */
	if (tb[IPSET_ATTR_PROTO]) {
		e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
		with_ports = ip_set_proto_with_ports(e.proto);

		if (e.proto == 0)
			return -IPSET_ERR_INVALID_PROTO;
	} else {
		return -IPSET_ERR_MISSING_PROTO;
	}

	/* The port value is kept only for port-carrying protocols and ICMPv6. */
	if (!(with_ports || e.proto == IPPROTO_ICMPV6))
		e.port = 0;

	/* Single-element case: a test, a portless protocol, or no PORT_TO. */
	if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) {
		ret = adtfn(set, &e, &ext, &ext, flags);
		return ip_set_eexist(ret, flags) ? 0 : ret;
	}

	port = ntohs(e.port);
	port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
	if (port > port_to)
		swap(port, port_to);

	/*
	 * This function never writes h->next itself (h is const here), so the
	 * resume port is presumably stored by the generic hash code - TODO confirm.
	 */
	if (retried)
		port = ntohs(h->next.port);
	for (; port <= port_to; port++) {
		e.port = htons(port);
		ret = adtfn(set, &e, &ext, &ext, flags);

		/* Errors accepted by ip_set_eexist() do not abort the range. */
		if (ret && !ip_set_eexist(ret, flags))
			return ret;

		ret = 0;
	}
	return ret;
}

/*
 * Set type descriptor: name, revision range, the create callback and the
 * netlink attribute policies for create and for add/del/test requests.
 */
static struct ip_set_type hash_ipport_type __read_mostly = {
	.name = "hash:ip,port",
	.protocol = IPSET_PROTOCOL,
	.features = IPSET_TYPE_IP | IPSET_TYPE_PORT,
	.dimension = IPSET_DIM_TWO,
	.family = NFPROTO_UNSPEC,
	.revision_min = IPSET_TYPE_REV_MIN,
	.revision_max = IPSET_TYPE_REV_MAX,
	/* Only the newest revision's slot is assigned a create flag here. */
	.create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
	.create = hash_ipport_create,
	.create_policy = {
		[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
		[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
		[IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
		[IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
		[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
		[IPSET_ATTR_PROTO] = { .type = NLA_U8 },
		[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
		[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
		[IPSET_ATTR_NETMASK] = { .type = NLA_U8 },
		[IPSET_ATTR_BITMASK] = { .type = NLA_NESTED },
	},
	.adt_policy = {
		[IPSET_ATTR_IP] = { .type = NLA_NESTED },
		[IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
		[IPSET_ATTR_PORT] = { .type = NLA_U16 },
		[IPSET_ATTR_PORT_TO] = { .type = NLA_U16 },
		[IPSET_ATTR_CIDR] = { .type = NLA_U8 },
		[IPSET_ATTR_PROTO] = { .type = NLA_U8 },
		[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
		[IPSET_ATTR_LINENO] = { .type = NLA_U32 },
		[IPSET_ATTR_BYTES] = { .type = NLA_U64 },
		[IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
		[IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING,
					    .len = IPSET_MAX_COMMENT_SIZE },
		[IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
		[IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
		[IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
	},
	.me = THIS_MODULE,
};

/* Module load: register the set type; the registration result is returned. */
static int __init
hash_ipport_init(void)
{
	return ip_set_type_register(&hash_ipport_type);
}

/* Module unload: rcu_barrier() runs first, then the type is unregistered. */
static void __exit
hash_ipport_fini(void)
{
	rcu_barrier();
	ip_set_type_unregister(&hash_ipport_type);
}

module_init(hash_ipport_init);
module_exit(hash_ipport_fini);
4768 597 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 
525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 
1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 /* SPDX-License-Identifier: GPL-2.0 */ /* * security/tomoyo/common.h * * Header file for TOMOYO. 
* * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #ifndef _SECURITY_TOMOYO_COMMON_H #define _SECURITY_TOMOYO_COMMON_H #define pr_fmt(fmt) fmt #include <linux/ctype.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/file.h> #include <linux/kmod.h> #include <linux/fs.h> #include <linux/sched.h> #include <linux/namei.h> #include <linux/mount.h> #include <linux/list.h> #include <linux/cred.h> #include <linux/poll.h> #include <linux/binfmts.h> #include <linux/highmem.h> #include <linux/net.h> #include <linux/inet.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/un.h> #include <linux/lsm_hooks.h> #include <net/sock.h> #include <net/af_unix.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/udp.h> /********** Constants definitions. **********/ /* * TOMOYO uses this hash only when appending a string into the string * table. Frequency of appending strings is very low. So we don't need * large (e.g. 64k) hash size. 256 will be sufficient. */ #define TOMOYO_HASH_BITS 8 #define TOMOYO_MAX_HASH (1u<<TOMOYO_HASH_BITS) /* * TOMOYO checks only SOCK_STREAM, SOCK_DGRAM, SOCK_RAW, SOCK_SEQPACKET. * Therefore, we don't need SOCK_MAX. */ #define TOMOYO_SOCK_MAX 6 #define TOMOYO_EXEC_TMPSIZE 4096 /* Garbage collector is trying to kfree() this element. */ #define TOMOYO_GC_IN_PROGRESS -1 /* Profile number is an integer between 0 and 255. */ #define TOMOYO_MAX_PROFILES 256 /* Group number is an integer between 0 and 255. */ #define TOMOYO_MAX_ACL_GROUPS 256 /* Index numbers for "struct tomoyo_condition". 
*/
enum tomoyo_conditions_index {
	/* Attributes of the current task. */
	TOMOYO_TASK_UID, /* current_uid() */
	TOMOYO_TASK_EUID, /* current_euid() */
	TOMOYO_TASK_SUID, /* current_suid() */
	TOMOYO_TASK_FSUID, /* current_fsuid() */
	TOMOYO_TASK_GID, /* current_gid() */
	TOMOYO_TASK_EGID, /* current_egid() */
	TOMOYO_TASK_SGID, /* current_sgid() */
	TOMOYO_TASK_FSGID, /* current_fsgid() */
	TOMOYO_TASK_PID, /* sys_getpid() */
	TOMOYO_TASK_PPID, /* sys_getppid() */
	/* Argument and environment counts of an execve() request. */
	TOMOYO_EXEC_ARGC, /* "struct linux_binprm *"->argc */
	TOMOYO_EXEC_ENVC, /* "struct linux_binprm *"->envc */
	/* File type constants. */
	TOMOYO_TYPE_IS_SOCKET, /* S_IFSOCK */
	TOMOYO_TYPE_IS_SYMLINK, /* S_IFLNK */
	TOMOYO_TYPE_IS_FILE, /* S_IFREG */
	TOMOYO_TYPE_IS_BLOCK_DEV, /* S_IFBLK */
	TOMOYO_TYPE_IS_DIRECTORY, /* S_IFDIR */
	TOMOYO_TYPE_IS_CHAR_DEV, /* S_IFCHR */
	TOMOYO_TYPE_IS_FIFO, /* S_IFIFO */
	/* File mode bit constants. */
	TOMOYO_MODE_SETUID, /* S_ISUID */
	TOMOYO_MODE_SETGID, /* S_ISGID */
	TOMOYO_MODE_STICKY, /* S_ISVTX */
	TOMOYO_MODE_OWNER_READ, /* S_IRUSR */
	TOMOYO_MODE_OWNER_WRITE, /* S_IWUSR */
	TOMOYO_MODE_OWNER_EXECUTE, /* S_IXUSR */
	TOMOYO_MODE_GROUP_READ, /* S_IRGRP */
	TOMOYO_MODE_GROUP_WRITE, /* S_IWGRP */
	TOMOYO_MODE_GROUP_EXECUTE, /* S_IXGRP */
	TOMOYO_MODE_OTHERS_READ, /* S_IROTH */
	TOMOYO_MODE_OTHERS_WRITE, /* S_IWOTH */
	TOMOYO_MODE_OTHERS_EXECUTE, /* S_IXOTH */
	TOMOYO_EXEC_REALPATH,
	TOMOYO_SYMLINK_TARGET,
	/* Per-path attributes: first path, second path, then their parents. */
	TOMOYO_PATH1_UID,
	TOMOYO_PATH1_GID,
	TOMOYO_PATH1_INO,
	TOMOYO_PATH1_MAJOR,
	TOMOYO_PATH1_MINOR,
	TOMOYO_PATH1_PERM,
	TOMOYO_PATH1_TYPE,
	TOMOYO_PATH1_DEV_MAJOR,
	TOMOYO_PATH1_DEV_MINOR,
	TOMOYO_PATH2_UID,
	TOMOYO_PATH2_GID,
	TOMOYO_PATH2_INO,
	TOMOYO_PATH2_MAJOR,
	TOMOYO_PATH2_MINOR,
	TOMOYO_PATH2_PERM,
	TOMOYO_PATH2_TYPE,
	TOMOYO_PATH2_DEV_MAJOR,
	TOMOYO_PATH2_DEV_MINOR,
	TOMOYO_PATH1_PARENT_UID,
	TOMOYO_PATH1_PARENT_GID,
	TOMOYO_PATH1_PARENT_INO,
	TOMOYO_PATH1_PARENT_PERM,
	TOMOYO_PATH2_PARENT_UID,
	TOMOYO_PATH2_PARENT_GID,
	TOMOYO_PATH2_PARENT_INO,
	TOMOYO_PATH2_PARENT_PERM,
	/*
	 * Equals the number of entries above. The four entries below are
	 * numbered past this count; presumably they tag operand kinds rather
	 * than keywords - TODO confirm against the users of this enum.
	 */
	TOMOYO_MAX_CONDITION_KEYWORD,
	TOMOYO_NUMBER_UNION,
	TOMOYO_NAME_UNION,
	TOMOYO_ARGV_ENTRY,
	TOMOYO_ENVP_ENTRY,
};

/* Index numbers for stat().
*/
enum tomoyo_path_stat_index {
	/* Do not change this order. */
	TOMOYO_PATH1,
	TOMOYO_PATH1_PARENT,
	TOMOYO_PATH2,
	TOMOYO_PATH2_PARENT,
	TOMOYO_MAX_PATH_STAT /* number of entries above */
};

/* Index numbers for operation mode. */
enum tomoyo_mode_index {
	/* The four modes take the values 0..3. */
	TOMOYO_CONFIG_DISABLED,
	TOMOYO_CONFIG_LEARNING,
	TOMOYO_CONFIG_PERMISSIVE,
	TOMOYO_CONFIG_ENFORCING,
	TOMOYO_CONFIG_MAX_MODE, /* number of modes above (4) */
	/*
	 * Explicit values outside the mode range. 64 and 128 are single bits,
	 * so presumably they are OR-ed onto a mode value - TODO confirm.
	 */
	TOMOYO_CONFIG_WANT_REJECT_LOG = 64,
	TOMOYO_CONFIG_WANT_GRANT_LOG = 128,
	TOMOYO_CONFIG_USE_DEFAULT = 255,
};

/* Index numbers for entry type. */
enum tomoyo_policy_id {
	TOMOYO_ID_GROUP,
	TOMOYO_ID_ADDRESS_GROUP,
	TOMOYO_ID_PATH_GROUP,
	TOMOYO_ID_NUMBER_GROUP,
	TOMOYO_ID_TRANSITION_CONTROL,
	TOMOYO_ID_AGGREGATOR,
	TOMOYO_ID_MANAGER,
	TOMOYO_ID_CONDITION,
	TOMOYO_ID_NAME,
	TOMOYO_ID_ACL,
	TOMOYO_ID_DOMAIN,
	TOMOYO_MAX_POLICY /* number of entries above */
};

/* Index numbers for domain's attributes. */
enum tomoyo_domain_info_flags_index {
	/* Quota warning flag. */
	TOMOYO_DIF_QUOTA_WARNED,
	/*
	 * This domain was unable to create a new domain at
	 * tomoyo_find_next_domain() because the name of the domain to be
	 * created was too long or it could not allocate memory.
	 * More than one process continued execve() without domain transition.
	 */
	TOMOYO_DIF_TRANSITION_FAILED,
	TOMOYO_MAX_DOMAIN_INFO_FLAGS /* number of flags above */
};

/* Index numbers for audit type. */
enum tomoyo_grant_log {
	/* Follow profile's configuration. */
	TOMOYO_GRANTLOG_AUTO,
	/* Do not generate grant log. */
	TOMOYO_GRANTLOG_NO,
	/* Generate grant_log. */
	TOMOYO_GRANTLOG_YES,
};

/* Index numbers for group entries. */
enum tomoyo_group_id {
	TOMOYO_PATH_GROUP,
	TOMOYO_NUMBER_GROUP,
	TOMOYO_ADDRESS_GROUP,
	TOMOYO_MAX_GROUP /* number of group kinds above */
};

/* Index numbers for type of numeric values. */
enum tomoyo_value_type {
	TOMOYO_VALUE_TYPE_INVALID, /* takes the value 0 */
	TOMOYO_VALUE_TYPE_DECIMAL,
	TOMOYO_VALUE_TYPE_OCTAL,
	TOMOYO_VALUE_TYPE_HEXADECIMAL,
};

/* Index numbers for domain transition control keywords.
*/
enum tomoyo_transition_type {
	/* Do not change this order. */
	TOMOYO_TRANSITION_CONTROL_NO_RESET,
	TOMOYO_TRANSITION_CONTROL_RESET,
	TOMOYO_TRANSITION_CONTROL_NO_INITIALIZE,
	TOMOYO_TRANSITION_CONTROL_INITIALIZE,
	TOMOYO_TRANSITION_CONTROL_NO_KEEP,
	TOMOYO_TRANSITION_CONTROL_KEEP,
	TOMOYO_MAX_TRANSITION_TYPE
};

/* Index numbers for Access Controls. */
enum tomoyo_acl_entry_type_index {
	TOMOYO_TYPE_PATH_ACL,
	TOMOYO_TYPE_PATH2_ACL,
	TOMOYO_TYPE_PATH_NUMBER_ACL,
	TOMOYO_TYPE_MKDEV_ACL,
	TOMOYO_TYPE_MOUNT_ACL,
	TOMOYO_TYPE_INET_ACL,
	TOMOYO_TYPE_UNIX_ACL,
	TOMOYO_TYPE_ENV_ACL,
	TOMOYO_TYPE_MANUAL_TASK_ACL,
};

/* Index numbers for access controls with one pathname. */
enum tomoyo_path_acl_index {
	TOMOYO_TYPE_EXECUTE,
	TOMOYO_TYPE_READ,
	TOMOYO_TYPE_WRITE,
	TOMOYO_TYPE_APPEND,
	TOMOYO_TYPE_UNLINK,
	TOMOYO_TYPE_GETATTR,
	TOMOYO_TYPE_RMDIR,
	TOMOYO_TYPE_TRUNCATE,
	TOMOYO_TYPE_SYMLINK,
	TOMOYO_TYPE_CHROOT,
	TOMOYO_TYPE_UMOUNT,
	TOMOYO_MAX_PATH_OPERATION
};

/* Index numbers for /sys/kernel/security/tomoyo/stat interface. */
enum tomoyo_memory_stat_type {
	TOMOYO_MEMORY_POLICY,
	TOMOYO_MEMORY_AUDIT,
	TOMOYO_MEMORY_QUERY,
	TOMOYO_MAX_MEMORY_STAT
};

/*
 * Index numbers for the "file mkblock" and "file mkchar" access controls
 * (the permission bits of "struct tomoyo_mkdev_acl").
 */
enum tomoyo_mkdev_acl_index {
	TOMOYO_TYPE_MKBLOCK,
	TOMOYO_TYPE_MKCHAR,
	TOMOYO_MAX_MKDEV_OPERATION
};

/* Index numbers for socket operations. */
enum tomoyo_network_acl_index {
	TOMOYO_NETWORK_BIND, /* bind() operation. */
	TOMOYO_NETWORK_LISTEN, /* listen() operation. */
	TOMOYO_NETWORK_CONNECT, /* connect() operation. */
	TOMOYO_NETWORK_SEND, /* send() operation. */
	TOMOYO_MAX_NETWORK_OPERATION
};

/* Index numbers for access controls with two pathnames. */
enum tomoyo_path2_acl_index {
	TOMOYO_TYPE_LINK,
	TOMOYO_TYPE_RENAME,
	TOMOYO_TYPE_PIVOT_ROOT,
	TOMOYO_MAX_PATH2_OPERATION
};

/* Index numbers for access controls with one pathname and one number.
*/ enum tomoyo_path_number_acl_index { TOMOYO_TYPE_CREATE, TOMOYO_TYPE_MKDIR, TOMOYO_TYPE_MKFIFO, TOMOYO_TYPE_MKSOCK, TOMOYO_TYPE_IOCTL, TOMOYO_TYPE_CHMOD, TOMOYO_TYPE_CHOWN, TOMOYO_TYPE_CHGRP, TOMOYO_MAX_PATH_NUMBER_OPERATION }; /* Index numbers for /sys/kernel/security/tomoyo/ interfaces. */ enum tomoyo_securityfs_interface_index { TOMOYO_DOMAINPOLICY, TOMOYO_EXCEPTIONPOLICY, TOMOYO_PROCESS_STATUS, TOMOYO_STAT, TOMOYO_AUDIT, TOMOYO_VERSION, TOMOYO_PROFILE, TOMOYO_QUERY, TOMOYO_MANAGER }; /* Index numbers for special mount operations. */ enum tomoyo_special_mount { TOMOYO_MOUNT_BIND, /* mount --bind /source /dest */ TOMOYO_MOUNT_MOVE, /* mount --move /old /new */ TOMOYO_MOUNT_REMOUNT, /* mount -o remount /dir */ TOMOYO_MOUNT_MAKE_UNBINDABLE, /* mount --make-unbindable /dir */ TOMOYO_MOUNT_MAKE_PRIVATE, /* mount --make-private /dir */ TOMOYO_MOUNT_MAKE_SLAVE, /* mount --make-slave /dir */ TOMOYO_MOUNT_MAKE_SHARED, /* mount --make-shared /dir */ TOMOYO_MAX_SPECIAL_MOUNT }; /* Index numbers for functionality. 
*/ enum tomoyo_mac_index { TOMOYO_MAC_FILE_EXECUTE, TOMOYO_MAC_FILE_OPEN, TOMOYO_MAC_FILE_CREATE, TOMOYO_MAC_FILE_UNLINK, TOMOYO_MAC_FILE_GETATTR, TOMOYO_MAC_FILE_MKDIR, TOMOYO_MAC_FILE_RMDIR, TOMOYO_MAC_FILE_MKFIFO, TOMOYO_MAC_FILE_MKSOCK, TOMOYO_MAC_FILE_TRUNCATE, TOMOYO_MAC_FILE_SYMLINK, TOMOYO_MAC_FILE_MKBLOCK, TOMOYO_MAC_FILE_MKCHAR, TOMOYO_MAC_FILE_LINK, TOMOYO_MAC_FILE_RENAME, TOMOYO_MAC_FILE_CHMOD, TOMOYO_MAC_FILE_CHOWN, TOMOYO_MAC_FILE_CHGRP, TOMOYO_MAC_FILE_IOCTL, TOMOYO_MAC_FILE_CHROOT, TOMOYO_MAC_FILE_MOUNT, TOMOYO_MAC_FILE_UMOUNT, TOMOYO_MAC_FILE_PIVOT_ROOT, TOMOYO_MAC_NETWORK_INET_STREAM_BIND, TOMOYO_MAC_NETWORK_INET_STREAM_LISTEN, TOMOYO_MAC_NETWORK_INET_STREAM_CONNECT, TOMOYO_MAC_NETWORK_INET_DGRAM_BIND, TOMOYO_MAC_NETWORK_INET_DGRAM_SEND, TOMOYO_MAC_NETWORK_INET_RAW_BIND, TOMOYO_MAC_NETWORK_INET_RAW_SEND, TOMOYO_MAC_NETWORK_UNIX_STREAM_BIND, TOMOYO_MAC_NETWORK_UNIX_STREAM_LISTEN, TOMOYO_MAC_NETWORK_UNIX_STREAM_CONNECT, TOMOYO_MAC_NETWORK_UNIX_DGRAM_BIND, TOMOYO_MAC_NETWORK_UNIX_DGRAM_SEND, TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_BIND, TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_LISTEN, TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_CONNECT, TOMOYO_MAC_ENVIRON, TOMOYO_MAX_MAC_INDEX }; /* Index numbers for category of functionality. */ enum tomoyo_mac_category_index { TOMOYO_MAC_CATEGORY_FILE, TOMOYO_MAC_CATEGORY_NETWORK, TOMOYO_MAC_CATEGORY_MISC, TOMOYO_MAX_MAC_CATEGORY_INDEX }; /* * Retry this request. Returned by tomoyo_supervisor() if policy violation has * occurred in enforcing mode and the userspace daemon decided to retry. * * We must choose a positive value in order to distinguish "granted" (which is * 0) and "rejected" (which is a negative value) and "retry". */ #define TOMOYO_RETRY_REQUEST 1 /* Index numbers for /sys/kernel/security/tomoyo/stat interface. */ enum tomoyo_policy_stat_type { /* Do not change this order. 
*/ TOMOYO_STAT_POLICY_UPDATES, TOMOYO_STAT_POLICY_LEARNING, /* == TOMOYO_CONFIG_LEARNING */ TOMOYO_STAT_POLICY_PERMISSIVE, /* == TOMOYO_CONFIG_PERMISSIVE */ TOMOYO_STAT_POLICY_ENFORCING, /* == TOMOYO_CONFIG_ENFORCING */ TOMOYO_MAX_POLICY_STAT }; /* Index numbers for profile's PREFERENCE values. */ enum tomoyo_pref_index { TOMOYO_PREF_MAX_AUDIT_LOG, TOMOYO_PREF_MAX_LEARNING_ENTRY, TOMOYO_MAX_PREF }; /********** Structure definitions. **********/ /* Common header for holding ACL entries. */ struct tomoyo_acl_head { struct list_head list; s8 is_deleted; /* true or false or TOMOYO_GC_IN_PROGRESS */ } __packed; /* Common header for shared entries. */ struct tomoyo_shared_acl_head { struct list_head list; atomic_t users; } __packed; struct tomoyo_policy_namespace; /* Structure for request info. */ struct tomoyo_request_info { /* * For holding parameters specific to operations which deal files. * NULL if not dealing files. */ struct tomoyo_obj_info *obj; /* * For holding parameters specific to execve() request. * NULL if not dealing execve(). */ struct tomoyo_execve *ee; struct tomoyo_domain_info *domain; /* For holding parameters. */ union { struct { const struct tomoyo_path_info *filename; /* For using wildcards at tomoyo_find_next_domain(). */ const struct tomoyo_path_info *matched_path; /* One of values in "enum tomoyo_path_acl_index". */ u8 operation; } path; struct { const struct tomoyo_path_info *filename1; const struct tomoyo_path_info *filename2; /* One of values in "enum tomoyo_path2_acl_index". */ u8 operation; } path2; struct { const struct tomoyo_path_info *filename; unsigned int mode; unsigned int major; unsigned int minor; /* One of values in "enum tomoyo_mkdev_acl_index". */ u8 operation; } mkdev; struct { const struct tomoyo_path_info *filename; unsigned long number; /* * One of values in * "enum tomoyo_path_number_acl_index". 
*/ u8 operation; } path_number; struct { const struct tomoyo_path_info *name; } environ; struct { const __be32 *address; u16 port; /* One of values smaller than TOMOYO_SOCK_MAX. */ u8 protocol; /* One of values in "enum tomoyo_network_acl_index". */ u8 operation; bool is_ipv6; } inet_network; struct { const struct tomoyo_path_info *address; /* One of values smaller than TOMOYO_SOCK_MAX. */ u8 protocol; /* One of values in "enum tomoyo_network_acl_index". */ u8 operation; } unix_network; struct { const struct tomoyo_path_info *type; const struct tomoyo_path_info *dir; const struct tomoyo_path_info *dev; unsigned long flags; int need_dev; } mount; struct { const struct tomoyo_path_info *domainname; } task; } param; struct tomoyo_acl_info *matched_acl; u8 param_type; bool granted; u8 retry; u8 profile; u8 mode; /* One of tomoyo_mode_index . */ u8 type; }; /* Structure for holding a token. */ struct tomoyo_path_info { const char *name; u32 hash; /* = full_name_hash(name, strlen(name)) */ u16 const_len; /* = tomoyo_const_part_length(name) */ bool is_dir; /* = tomoyo_strendswith(name, "/") */ bool is_patterned; /* = tomoyo_path_contains_pattern(name) */ }; /* Structure for holding string data. */ struct tomoyo_name { struct tomoyo_shared_acl_head head; struct tomoyo_path_info entry; }; /* Structure for holding a word. */ struct tomoyo_name_union { /* Either @filename or @group is NULL. */ const struct tomoyo_path_info *filename; struct tomoyo_group *group; }; /* Structure for holding a number. */ struct tomoyo_number_union { unsigned long values[2]; struct tomoyo_group *group; /* Maybe NULL. */ /* One of values in "enum tomoyo_value_type". */ u8 value_type[2]; }; /* Structure for holding an IP address. */ struct tomoyo_ipaddr_union { struct in6_addr ip[2]; /* Big endian. */ struct tomoyo_group *group; /* Pointer to address group. */ bool is_ipv6; /* Valid only if @group == NULL. */ }; /* Structure for "path_group"/"number_group"/"address_group" directive. 
*/ struct tomoyo_group { struct tomoyo_shared_acl_head head; const struct tomoyo_path_info *group_name; struct list_head member_list; }; /* Structure for "path_group" directive. */ struct tomoyo_path_group { struct tomoyo_acl_head head; const struct tomoyo_path_info *member_name; }; /* Structure for "number_group" directive. */ struct tomoyo_number_group { struct tomoyo_acl_head head; struct tomoyo_number_union number; }; /* Structure for "address_group" directive. */ struct tomoyo_address_group { struct tomoyo_acl_head head; /* Structure for holding an IP address. */ struct tomoyo_ipaddr_union address; }; /* Subset of "struct stat". Used by conditional ACL and audit logs. */ struct tomoyo_mini_stat { kuid_t uid; kgid_t gid; ino_t ino; umode_t mode; dev_t dev; dev_t rdev; }; /* Structure for dumping argv[] and envp[] of "struct linux_binprm". */ struct tomoyo_page_dump { struct page *page; /* Previously dumped page. */ char *data; /* Contents of "page". Size is PAGE_SIZE. */ }; /* Structure for attribute checks in addition to pathname checks. */ struct tomoyo_obj_info { /* * True if tomoyo_get_attributes() was already called, false otherwise. */ bool validate_done; /* True if @stat[] is valid. */ bool stat_valid[TOMOYO_MAX_PATH_STAT]; /* First pathname. Initialized with { NULL, NULL } if no path. */ struct path path1; /* Second pathname. Initialized with { NULL, NULL } if no path. */ struct path path2; /* * Information on @path1, @path1's parent directory, @path2, @path2's * parent directory. */ struct tomoyo_mini_stat stat[TOMOYO_MAX_PATH_STAT]; /* * Content of symbolic link to be created. NULL for operations other * than symlink(). */ struct tomoyo_path_info *symlink_target; }; /* Structure for argv[]. */ struct tomoyo_argv { unsigned long index; const struct tomoyo_path_info *value; bool is_not; }; /* Structure for envp[]. 
*/ struct tomoyo_envp { const struct tomoyo_path_info *name; const struct tomoyo_path_info *value; bool is_not; }; /* Structure for execve() operation. */ struct tomoyo_execve { struct tomoyo_request_info r; struct tomoyo_obj_info obj; struct linux_binprm *bprm; const struct tomoyo_path_info *transition; /* For dumping argv[] and envp[]. */ struct tomoyo_page_dump dump; /* For temporary use. */ char *tmp; /* Size is TOMOYO_EXEC_TMPSIZE bytes */ }; /* Structure for entries which follows "struct tomoyo_condition". */ struct tomoyo_condition_element { /* * Left hand operand. A "struct tomoyo_argv" for TOMOYO_ARGV_ENTRY, a * "struct tomoyo_envp" for TOMOYO_ENVP_ENTRY is attached to the tail * of the array of this struct. */ u8 left; /* * Right hand operand. A "struct tomoyo_number_union" for * TOMOYO_NUMBER_UNION, a "struct tomoyo_name_union" for * TOMOYO_NAME_UNION is attached to the tail of the array of this * struct. */ u8 right; /* Equation operator. True if equals or overlaps, false otherwise. */ bool equals; }; /* Structure for optional arguments. */ struct tomoyo_condition { struct tomoyo_shared_acl_head head; u32 size; /* Memory size allocated for this entry. */ u16 condc; /* Number of conditions in this struct. */ u16 numbers_count; /* Number of "struct tomoyo_number_union values". */ u16 names_count; /* Number of "struct tomoyo_name_union names". */ u16 argc; /* Number of "struct tomoyo_argv". */ u16 envc; /* Number of "struct tomoyo_envp". */ u8 grant_log; /* One of values in "enum tomoyo_grant_log". */ const struct tomoyo_path_info *transit; /* Maybe NULL. */ /* * struct tomoyo_condition_element condition[condc]; * struct tomoyo_number_union values[numbers_count]; * struct tomoyo_name_union names[names_count]; * struct tomoyo_argv argv[argc]; * struct tomoyo_envp envp[envc]; */ }; /* Common header for individual entries. */ struct tomoyo_acl_info { struct list_head list; struct tomoyo_condition *cond; /* Maybe NULL. 
*/ s8 is_deleted; /* true or false or TOMOYO_GC_IN_PROGRESS */ u8 type; /* One of values in "enum tomoyo_acl_entry_type_index". */ } __packed; /* Structure for domain information. */ struct tomoyo_domain_info { struct list_head list; struct list_head acl_info_list; /* Name of this domain. Never NULL. */ const struct tomoyo_path_info *domainname; /* Namespace for this domain. Never NULL. */ struct tomoyo_policy_namespace *ns; /* Group numbers to use. */ unsigned long group[TOMOYO_MAX_ACL_GROUPS / BITS_PER_LONG]; u8 profile; /* Profile number to use. */ bool is_deleted; /* Delete flag. */ bool flags[TOMOYO_MAX_DOMAIN_INFO_FLAGS]; atomic_t users; /* Number of referring tasks. */ }; /* * Structure for "task manual_domain_transition" directive. */ struct tomoyo_task_acl { struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_MANUAL_TASK_ACL */ /* Pointer to domainname. */ const struct tomoyo_path_info *domainname; }; /* * Structure for "file execute", "file read", "file write", "file append", * "file unlink", "file getattr", "file rmdir", "file truncate", * "file symlink", "file chroot" and "file unmount" directive. */ struct tomoyo_path_acl { struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_PATH_ACL */ u16 perm; /* Bitmask of values in "enum tomoyo_path_acl_index". */ struct tomoyo_name_union name; }; /* * Structure for "file create", "file mkdir", "file mkfifo", "file mksock", * "file ioctl", "file chmod", "file chown" and "file chgrp" directive. */ struct tomoyo_path_number_acl { struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_PATH_NUMBER_ACL */ /* Bitmask of values in "enum tomoyo_path_number_acl_index". */ u8 perm; struct tomoyo_name_union name; struct tomoyo_number_union number; }; /* Structure for "file mkblock" and "file mkchar" directive. */ struct tomoyo_mkdev_acl { struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_MKDEV_ACL */ u8 perm; /* Bitmask of values in "enum tomoyo_mkdev_acl_index". 
*/ struct tomoyo_name_union name; struct tomoyo_number_union mode; struct tomoyo_number_union major; struct tomoyo_number_union minor; }; /* * Structure for "file rename", "file link" and "file pivot_root" directive. */ struct tomoyo_path2_acl { struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_PATH2_ACL */ u8 perm; /* Bitmask of values in "enum tomoyo_path2_acl_index". */ struct tomoyo_name_union name1; struct tomoyo_name_union name2; }; /* Structure for "file mount" directive. */ struct tomoyo_mount_acl { struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_MOUNT_ACL */ struct tomoyo_name_union dev_name; struct tomoyo_name_union dir_name; struct tomoyo_name_union fs_type; struct tomoyo_number_union flags; }; /* Structure for "misc env" directive in domain policy. */ struct tomoyo_env_acl { struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_ENV_ACL */ const struct tomoyo_path_info *env; /* environment variable */ }; /* Structure for "network inet" directive. */ struct tomoyo_inet_acl { struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_INET_ACL */ u8 protocol; u8 perm; /* Bitmask of values in "enum tomoyo_network_acl_index" */ struct tomoyo_ipaddr_union address; struct tomoyo_number_union port; }; /* Structure for "network unix" directive. */ struct tomoyo_unix_acl { struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_UNIX_ACL */ u8 protocol; u8 perm; /* Bitmask of values in "enum tomoyo_network_acl_index" */ struct tomoyo_name_union name; }; /* Structure for holding a line from /sys/kernel/security/tomoyo/ interface. */ struct tomoyo_acl_param { char *data; struct list_head *list; struct tomoyo_policy_namespace *ns; bool is_delete; }; #define TOMOYO_MAX_IO_READ_QUEUE 64 /* * Structure for reading/writing policy via /sys/kernel/security/tomoyo * interfaces. */ struct tomoyo_io_buffer { void (*read)(struct tomoyo_io_buffer *head); int (*write)(struct tomoyo_io_buffer *head); __poll_t (*poll)(struct file *file, poll_table *wait); /* Exclusive lock for this structure. 
*/ struct mutex io_sem; char __user *read_user_buf; size_t read_user_buf_avail; struct { struct list_head *ns; struct list_head *domain; struct list_head *group; struct list_head *acl; size_t avail; unsigned int step; unsigned int query_index; u16 index; u16 cond_index; u8 acl_group_index; u8 cond_step; u8 bit; u8 w_pos; bool eof; bool print_this_domain_only; bool print_transition_related_only; bool print_cond_part; const char *w[TOMOYO_MAX_IO_READ_QUEUE]; } r; struct { struct tomoyo_policy_namespace *ns; /* The position currently writing to. */ struct tomoyo_domain_info *domain; /* Bytes available for writing. */ size_t avail; bool is_delete; } w; /* Buffer for reading. */ char *read_buf; /* Size of read buffer. */ size_t readbuf_size; /* Buffer for writing. */ char *write_buf; /* Size of write buffer. */ size_t writebuf_size; /* Type of this interface. */ enum tomoyo_securityfs_interface_index type; /* Users counter protected by tomoyo_io_buffer_list_lock. */ u8 users; /* List for telling GC not to kfree() elements. */ struct list_head list; }; /* * Structure for "initialize_domain"/"no_initialize_domain"/"keep_domain"/ * "no_keep_domain" keyword. */ struct tomoyo_transition_control { struct tomoyo_acl_head head; u8 type; /* One of values in "enum tomoyo_transition_type". */ /* True if the domainname is tomoyo_get_last_name(). */ bool is_last_name; const struct tomoyo_path_info *domainname; /* Maybe NULL */ const struct tomoyo_path_info *program; /* Maybe NULL */ }; /* Structure for "aggregator" keyword. */ struct tomoyo_aggregator { struct tomoyo_acl_head head; const struct tomoyo_path_info *original_name; const struct tomoyo_path_info *aggregated_name; }; /* Structure for policy manager. */ struct tomoyo_manager { struct tomoyo_acl_head head; /* A path to program or a domainname. 
*/
	const struct tomoyo_path_info *manager;
};

/*
 * Preference values. "struct tomoyo_profile" embeds one instance and also
 * holds one pointer per mode (learning/permissive/enforcing).
 */
struct tomoyo_preference {
	unsigned int learning_max_entry;
	bool enforcing_verbose;
	bool learning_verbose;
	bool permissive_verbose;
};

/* Structure for /sys/kernel/security/tomoyo/profile interface. */
struct tomoyo_profile {
	const struct tomoyo_path_info *comment;
	struct tomoyo_preference *learning;
	struct tomoyo_preference *permissive;
	struct tomoyo_preference *enforcing;
	struct tomoyo_preference preference;
	u8 default_config;
	/* One slot per MAC index followed by one slot per MAC category. */
	u8 config[TOMOYO_MAX_MAC_INDEX + TOMOYO_MAX_MAC_CATEGORY_INDEX];
	/* Sized by "enum tomoyo_pref_index". */
	unsigned int pref[TOMOYO_MAX_PREF];
};

/* Structure for representing YYYY/MM/DD hh/mm/ss. */
struct tomoyo_time {
	u16 year;
	u8 month;
	u8 day;
	u8 hour;
	u8 min;
	u8 sec;
};

/* Structure for policy namespace. */
struct tomoyo_policy_namespace {
	/* Profile table. Memory is allocated as needed. */
	struct tomoyo_profile *profile_ptr[TOMOYO_MAX_PROFILES];
	/* List of "struct tomoyo_group". */
	struct list_head group_list[TOMOYO_MAX_GROUP];
	/* List of policy. */
	struct list_head policy_list[TOMOYO_MAX_POLICY];
	/* The global ACL referred by "use_group" keyword. */
	struct list_head acl_group[TOMOYO_MAX_ACL_GROUPS];
	/* List for connecting to tomoyo_namespace_list list. */
	struct list_head namespace_list;
	/* Profile version. Currently only 20150505 is defined. */
	unsigned int profile_version;
	/* Name of this namespace (e.g. "<kernel>", "</usr/sbin/httpd>"). */
	const char *name;
};

/* Structure for "struct task_struct"->security. */
struct tomoyo_task {
	struct tomoyo_domain_info *domain_info;
	struct tomoyo_domain_info *old_domain_info;
};

/********** Function prototypes.
**********/ bool tomoyo_address_matches_group(const bool is_ipv6, const __be32 *address, const struct tomoyo_group *group); bool tomoyo_compare_number_union(const unsigned long value, const struct tomoyo_number_union *ptr); bool tomoyo_condition(struct tomoyo_request_info *r, const struct tomoyo_condition *cond); bool tomoyo_correct_domain(const unsigned char *domainname); bool tomoyo_correct_path(const char *filename); bool tomoyo_correct_word(const char *string); bool tomoyo_domain_def(const unsigned char *buffer); bool tomoyo_domain_quota_is_ok(struct tomoyo_request_info *r); bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos, struct tomoyo_page_dump *dump); bool tomoyo_memory_ok(void *ptr); bool tomoyo_number_matches_group(const unsigned long min, const unsigned long max, const struct tomoyo_group *group); bool tomoyo_parse_ipaddr_union(struct tomoyo_acl_param *param, struct tomoyo_ipaddr_union *ptr); bool tomoyo_parse_name_union(struct tomoyo_acl_param *param, struct tomoyo_name_union *ptr); bool tomoyo_parse_number_union(struct tomoyo_acl_param *param, struct tomoyo_number_union *ptr); bool tomoyo_path_matches_pattern(const struct tomoyo_path_info *filename, const struct tomoyo_path_info *pattern); bool tomoyo_permstr(const char *string, const char *keyword); bool tomoyo_str_starts(char **src, const char *find); char *tomoyo_encode(const char *str); char *tomoyo_encode2(const char *str, int str_len); char *tomoyo_init_log(struct tomoyo_request_info *r, int len, const char *fmt, va_list args) __printf(3, 0); char *tomoyo_read_token(struct tomoyo_acl_param *param); char *tomoyo_realpath_from_path(const struct path *path); char *tomoyo_realpath_nofollow(const char *pathname); const char *tomoyo_get_exe(void); const struct tomoyo_path_info *tomoyo_compare_name_union (const struct tomoyo_path_info *name, const struct tomoyo_name_union *ptr); const struct tomoyo_path_info *tomoyo_get_domainname (struct tomoyo_acl_param *param); const struct 
tomoyo_path_info *tomoyo_get_name(const char *name); const struct tomoyo_path_info *tomoyo_path_matches_group (const struct tomoyo_path_info *pathname, const struct tomoyo_group *group); int tomoyo_check_open_permission(struct tomoyo_domain_info *domain, const struct path *path, const int flag); void tomoyo_close_control(struct tomoyo_io_buffer *head); int tomoyo_env_perm(struct tomoyo_request_info *r, const char *env); int tomoyo_execute_permission(struct tomoyo_request_info *r, const struct tomoyo_path_info *filename); int tomoyo_find_next_domain(struct linux_binprm *bprm); int tomoyo_get_mode(const struct tomoyo_policy_namespace *ns, const u8 profile, const u8 index); int tomoyo_init_request_info(struct tomoyo_request_info *r, struct tomoyo_domain_info *domain, const u8 index); int tomoyo_mkdev_perm(const u8 operation, const struct path *path, const unsigned int mode, unsigned int dev); int tomoyo_mount_permission(const char *dev_name, const struct path *path, const char *type, unsigned long flags, void *data_page); int tomoyo_open_control(const u8 type, struct file *file); int tomoyo_path2_perm(const u8 operation, const struct path *path1, const struct path *path2); int tomoyo_path_number_perm(const u8 operation, const struct path *path, unsigned long number); int tomoyo_path_perm(const u8 operation, const struct path *path, const char *target); __poll_t tomoyo_poll_control(struct file *file, poll_table *wait); __poll_t tomoyo_poll_log(struct file *file, poll_table *wait); int tomoyo_socket_bind_permission(struct socket *sock, struct sockaddr *addr, int addr_len); int tomoyo_socket_connect_permission(struct socket *sock, struct sockaddr *addr, int addr_len); int tomoyo_socket_listen_permission(struct socket *sock); int tomoyo_socket_sendmsg_permission(struct socket *sock, struct msghdr *msg, int size); int tomoyo_supervisor(struct tomoyo_request_info *r, const char *fmt, ...) 
__printf(2, 3); int tomoyo_update_domain(struct tomoyo_acl_info *new_entry, const int size, struct tomoyo_acl_param *param, bool (*check_duplicate) (const struct tomoyo_acl_info *, const struct tomoyo_acl_info *), bool (*merge_duplicate) (struct tomoyo_acl_info *, struct tomoyo_acl_info *, const bool)); int tomoyo_update_policy(struct tomoyo_acl_head *new_entry, const int size, struct tomoyo_acl_param *param, bool (*check_duplicate) (const struct tomoyo_acl_head *, const struct tomoyo_acl_head *)); int tomoyo_write_aggregator(struct tomoyo_acl_param *param); int tomoyo_write_file(struct tomoyo_acl_param *param); int tomoyo_write_group(struct tomoyo_acl_param *param, const u8 type); int tomoyo_write_misc(struct tomoyo_acl_param *param); int tomoyo_write_inet_network(struct tomoyo_acl_param *param); int tomoyo_write_transition_control(struct tomoyo_acl_param *param, const u8 type); int tomoyo_write_unix_network(struct tomoyo_acl_param *param); ssize_t tomoyo_read_control(struct tomoyo_io_buffer *head, char __user *buffer, const int buffer_len); ssize_t tomoyo_write_control(struct tomoyo_io_buffer *head, const char __user *buffer, const int buffer_len); struct tomoyo_condition *tomoyo_get_condition(struct tomoyo_acl_param *param); struct tomoyo_domain_info *tomoyo_assign_domain(const char *domainname, const bool transit); struct tomoyo_domain_info *tomoyo_domain(void); struct tomoyo_domain_info *tomoyo_find_domain(const char *domainname); struct tomoyo_group *tomoyo_get_group(struct tomoyo_acl_param *param, const u8 idx); struct tomoyo_policy_namespace *tomoyo_assign_namespace (const char *domainname); struct tomoyo_profile *tomoyo_profile(const struct tomoyo_policy_namespace *ns, const u8 profile); u8 tomoyo_parse_ulong(unsigned long *result, char **str); void *tomoyo_commit_ok(void *data, const unsigned int size); void __init tomoyo_load_builtin_policy(void); void __init tomoyo_mm_init(void); void tomoyo_check_acl(struct tomoyo_request_info *r, bool 
(*check_entry)(struct tomoyo_request_info *, const struct tomoyo_acl_info *)); void tomoyo_check_profile(void); void tomoyo_convert_time(time64_t time, struct tomoyo_time *stamp); void tomoyo_del_condition(struct list_head *element); void tomoyo_fill_path_info(struct tomoyo_path_info *ptr); void tomoyo_get_attributes(struct tomoyo_obj_info *obj); void tomoyo_init_policy_namespace(struct tomoyo_policy_namespace *ns); void tomoyo_load_policy(const char *filename); void tomoyo_normalize_line(unsigned char *buffer); void tomoyo_notify_gc(struct tomoyo_io_buffer *head, const bool is_register); void tomoyo_print_ip(char *buf, const unsigned int size, const struct tomoyo_ipaddr_union *ptr); void tomoyo_print_ulong(char *buffer, const int buffer_len, const unsigned long value, const u8 type); void tomoyo_put_name_union(struct tomoyo_name_union *ptr); void tomoyo_put_number_union(struct tomoyo_number_union *ptr); void tomoyo_read_log(struct tomoyo_io_buffer *head); void tomoyo_update_stat(const u8 index); void tomoyo_warn_oom(const char *function); void tomoyo_write_log(struct tomoyo_request_info *r, const char *fmt, ...) __printf(2, 3); void tomoyo_write_log2(struct tomoyo_request_info *r, int len, const char *fmt, va_list args) __printf(3, 0); /********** External variable definitions. 
**********/ extern bool tomoyo_policy_loaded; extern int tomoyo_enabled; extern const char * const tomoyo_condition_keyword [TOMOYO_MAX_CONDITION_KEYWORD]; extern const char * const tomoyo_dif[TOMOYO_MAX_DOMAIN_INFO_FLAGS]; extern const char * const tomoyo_mac_keywords[TOMOYO_MAX_MAC_INDEX + TOMOYO_MAX_MAC_CATEGORY_INDEX]; extern const char * const tomoyo_mode[TOMOYO_CONFIG_MAX_MODE]; extern const char * const tomoyo_path_keyword[TOMOYO_MAX_PATH_OPERATION]; extern const char * const tomoyo_proto_keyword[TOMOYO_SOCK_MAX]; extern const char * const tomoyo_socket_keyword[TOMOYO_MAX_NETWORK_OPERATION]; extern const u8 tomoyo_index2category[TOMOYO_MAX_MAC_INDEX]; extern const u8 tomoyo_pn2mac[TOMOYO_MAX_PATH_NUMBER_OPERATION]; extern const u8 tomoyo_pnnn2mac[TOMOYO_MAX_MKDEV_OPERATION]; extern const u8 tomoyo_pp2mac[TOMOYO_MAX_PATH2_OPERATION]; extern struct list_head tomoyo_condition_list; extern struct list_head tomoyo_domain_list; extern struct list_head tomoyo_name_list[TOMOYO_MAX_HASH]; extern struct list_head tomoyo_namespace_list; extern struct mutex tomoyo_policy_lock; extern struct srcu_struct tomoyo_ss; extern struct tomoyo_domain_info tomoyo_kernel_domain; extern struct tomoyo_policy_namespace tomoyo_kernel_namespace; extern unsigned int tomoyo_memory_quota[TOMOYO_MAX_MEMORY_STAT]; extern unsigned int tomoyo_memory_used[TOMOYO_MAX_MEMORY_STAT]; extern struct lsm_blob_sizes tomoyo_blob_sizes; /********** Inlined functions. **********/ /** * tomoyo_read_lock - Take lock for protecting policy. * * Returns index number for tomoyo_read_unlock(). */ static inline int tomoyo_read_lock(void) { return srcu_read_lock(&tomoyo_ss); } /** * tomoyo_read_unlock - Release lock for protecting policy. * * @idx: Index number returned by tomoyo_read_lock(). * * Returns nothing. */ static inline void tomoyo_read_unlock(int idx) { srcu_read_unlock(&tomoyo_ss, idx); } /** * tomoyo_sys_getppid - Copy of getppid(). * * Returns parent process's PID. 
* * Alpha does not have getppid() defined. To be able to build this module on * Alpha, I have to copy getppid() from kernel/timer.c. */ static inline pid_t tomoyo_sys_getppid(void) { pid_t pid; rcu_read_lock(); pid = task_tgid_vnr(rcu_dereference(current->real_parent)); rcu_read_unlock(); return pid; } /** * tomoyo_sys_getpid - Copy of getpid(). * * Returns current thread's PID. * * Alpha does not have getpid() defined. To be able to build this module on * Alpha, I have to copy getpid() from kernel/timer.c. */ static inline pid_t tomoyo_sys_getpid(void) { return task_tgid_vnr(current); } /** * tomoyo_pathcmp - strcmp() for "struct tomoyo_path_info" structure. * * @a: Pointer to "struct tomoyo_path_info". * @b: Pointer to "struct tomoyo_path_info". * * Returns true if @a == @b, false otherwise. */ static inline bool tomoyo_pathcmp(const struct tomoyo_path_info *a, const struct tomoyo_path_info *b) { return a->hash != b->hash || strcmp(a->name, b->name); } /** * tomoyo_put_name - Drop reference on "struct tomoyo_name". * * @name: Pointer to "struct tomoyo_path_info". Maybe NULL. * * Returns nothing. */ static inline void tomoyo_put_name(const struct tomoyo_path_info *name) { if (name) { struct tomoyo_name *ptr = container_of(name, typeof(*ptr), entry); atomic_dec(&ptr->head.users); } } /** * tomoyo_put_condition - Drop reference on "struct tomoyo_condition". * * @cond: Pointer to "struct tomoyo_condition". Maybe NULL. * * Returns nothing. */ static inline void tomoyo_put_condition(struct tomoyo_condition *cond) { if (cond) atomic_dec(&cond->head.users); } /** * tomoyo_put_group - Drop reference on "struct tomoyo_group". * * @group: Pointer to "struct tomoyo_group". Maybe NULL. * * Returns nothing. */ static inline void tomoyo_put_group(struct tomoyo_group *group) { if (group) atomic_dec(&group->head.users); } /** * tomoyo_task - Get "struct tomoyo_task" for specified thread. * * @task - Pointer to "struct task_struct". 
* * Returns pointer to "struct tomoyo_task" for specified thread. */ static inline struct tomoyo_task *tomoyo_task(struct task_struct *task) { return task->security + tomoyo_blob_sizes.lbs_task; } /** * tomoyo_same_name_union - Check for duplicated "struct tomoyo_name_union" entry. * * @a: Pointer to "struct tomoyo_name_union". * @b: Pointer to "struct tomoyo_name_union". * * Returns true if @a == @b, false otherwise. */ static inline bool tomoyo_same_name_union (const struct tomoyo_name_union *a, const struct tomoyo_name_union *b) { return a->filename == b->filename && a->group == b->group; } /** * tomoyo_same_number_union - Check for duplicated "struct tomoyo_number_union" entry. * * @a: Pointer to "struct tomoyo_number_union". * @b: Pointer to "struct tomoyo_number_union". * * Returns true if @a == @b, false otherwise. */ static inline bool tomoyo_same_number_union (const struct tomoyo_number_union *a, const struct tomoyo_number_union *b) { return a->values[0] == b->values[0] && a->values[1] == b->values[1] && a->group == b->group && a->value_type[0] == b->value_type[0] && a->value_type[1] == b->value_type[1]; } /** * tomoyo_same_ipaddr_union - Check for duplicated "struct tomoyo_ipaddr_union" entry. * * @a: Pointer to "struct tomoyo_ipaddr_union". * @b: Pointer to "struct tomoyo_ipaddr_union". * * Returns true if @a == @b, false otherwise. */ static inline bool tomoyo_same_ipaddr_union (const struct tomoyo_ipaddr_union *a, const struct tomoyo_ipaddr_union *b) { return !memcmp(a->ip, b->ip, sizeof(a->ip)) && a->group == b->group && a->is_ipv6 == b->is_ipv6; } /** * tomoyo_current_namespace - Get "struct tomoyo_policy_namespace" for current thread. * * Returns pointer to "struct tomoyo_policy_namespace" for current thread. */ static inline struct tomoyo_policy_namespace *tomoyo_current_namespace(void) { return tomoyo_domain()->ns; } /** * list_for_each_cookie - iterate over a list with cookie. * @pos: the &struct list_head to use as a loop cursor. 
* @head: the head for your list. */ #define list_for_each_cookie(pos, head) \ if (!pos) \ pos = srcu_dereference((head)->next, &tomoyo_ss); \ for ( ; pos != (head); pos = srcu_dereference(pos->next, &tomoyo_ss)) #endif /* !defined(_SECURITY_TOMOYO_COMMON_H) */
265 264 266 569 570 569 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/bitmap.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) */ #include <linux/buffer_head.h> #include "ext4.h" unsigned int ext4_count_free(char *bitmap, unsigned int numchars) { return numchars * BITS_PER_BYTE - memweight(bitmap, numchars); } int ext4_inode_bitmap_csum_verify(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh, int sz) { __u32 hi; __u32 provided, calculated; struct ext4_sb_info *sbi = EXT4_SB(sb); if (!ext4_has_metadata_csum(sb)) return 1; provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo); calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) { hi = le16_to_cpu(gdp->bg_inode_bitmap_csum_hi); provided |= (hi << 16); } else calculated &= 0xFFFF; return provided == calculated; } void ext4_inode_bitmap_csum_set(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh, int sz) { __u32 csum; struct ext4_sb_info *sbi = EXT4_SB(sb); if (!ext4_has_metadata_csum(sb)) return; csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); gdp->bg_inode_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF); if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) gdp->bg_inode_bitmap_csum_hi = cpu_to_le16(csum >> 16); } int ext4_block_bitmap_csum_verify(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh) { __u32 hi; __u32 provided, calculated; struct ext4_sb_info *sbi = EXT4_SB(sb); int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8; if (!ext4_has_metadata_csum(sb)) 
return 1; provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo); calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) { hi = le16_to_cpu(gdp->bg_block_bitmap_csum_hi); provided |= (hi << 16); } else calculated &= 0xFFFF; return provided == calculated; } void ext4_block_bitmap_csum_set(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh) { int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8; __u32 csum; struct ext4_sb_info *sbi = EXT4_SB(sb); if (!ext4_has_metadata_csum(sb)) return; csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); gdp->bg_block_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF); if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) gdp->bg_block_bitmap_csum_hi = cpu_to_le16(csum >> 16); }
22534 23227 22536 22536 22547 22524 23129 22583 23278 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 // SPDX-License-Identifier: GPL-2.0+ /* * Restartable sequences system call * * Copyright (C) 2015, Google, Inc., * Paul Turner <pjt@google.com> and Andrew Hunter <ahh@google.com> * Copyright (C) 2015-2018, EfficiOS Inc., * Mathieu Desnoyers <mathieu.desnoyers@efficios.com> */ #include <linux/sched.h> #include 
<linux/uaccess.h> #include <linux/syscalls.h> #include <linux/rseq.h> #include <linux/types.h> #include <asm/ptrace.h> #define CREATE_TRACE_POINTS #include <trace/events/rseq.h> /* The original rseq structure size (including padding) is 32 bytes. */ #define ORIG_RSEQ_SIZE 32 #define RSEQ_CS_NO_RESTART_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | \ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE) /* * * Restartable sequences are a lightweight interface that allows * user-level code to be executed atomically relative to scheduler * preemption and signal delivery. Typically used for implementing * per-cpu operations. * * It allows user-space to perform update operations on per-cpu data * without requiring heavy-weight atomic operations. * * Detailed algorithm of rseq user-space assembly sequences: * * init(rseq_cs) * cpu = TLS->rseq::cpu_id_start * [1] TLS->rseq::rseq_cs = rseq_cs * [start_ip] ---------------------------- * [2] if (cpu != TLS->rseq::cpu_id) * goto abort_ip; * [3] <last_instruction_in_cs> * [post_commit_ip] ---------------------------- * * The address of jump target abort_ip must be outside the critical * region, i.e.: * * [abort_ip] < [start_ip] || [abort_ip] >= [post_commit_ip] * * Steps [2]-[3] (inclusive) need to be a sequence of instructions in * userspace that can handle being interrupted between any of those * instructions, and then resumed to the abort_ip. * * 1. Userspace stores the address of the struct rseq_cs assembly * block descriptor into the rseq_cs field of the registered * struct rseq TLS area. This update is performed through a single * store within the inline assembly instruction sequence. * [start_ip] * * 2. Userspace tests to check whether the current cpu_id field match * the cpu number loaded before start_ip, branching to abort_ip * in case of a mismatch. 
* * If the sequence is preempted or interrupted by a signal * at or after start_ip and before post_commit_ip, then the kernel * clears TLS->__rseq_abi::rseq_cs, and sets the user-space return * ip to abort_ip before returning to user-space, so the preempted * execution resumes at abort_ip. * * 3. Userspace critical section final instruction before * post_commit_ip is the commit. The critical section is * self-terminating. * [post_commit_ip] * * 4. <success> * * On failure at [2], or if interrupted by preempt or signal delivery * between [1] and [3]: * * [abort_ip] * F1. <failure> */ static int rseq_update_cpu_node_id(struct task_struct *t) { struct rseq __user *rseq = t->rseq; u32 cpu_id = raw_smp_processor_id(); u32 node_id = cpu_to_node(cpu_id); u32 mm_cid = task_mm_cid(t); WARN_ON_ONCE((int) mm_cid < 0); if (!user_write_access_begin(rseq, t->rseq_len)) goto efault; unsafe_put_user(cpu_id, &rseq->cpu_id_start, efault_end); unsafe_put_user(cpu_id, &rseq->cpu_id, efault_end); unsafe_put_user(node_id, &rseq->node_id, efault_end); unsafe_put_user(mm_cid, &rseq->mm_cid, efault_end); /* * Additional feature fields added after ORIG_RSEQ_SIZE * need to be conditionally updated only if * t->rseq_len != ORIG_RSEQ_SIZE. */ user_write_access_end(); trace_rseq_update(t); return 0; efault_end: user_write_access_end(); efault: return -EFAULT; } static int rseq_reset_rseq_cpu_node_id(struct task_struct *t) { u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0, mm_cid = 0; /* * Reset cpu_id_start to its initial state (0). */ if (put_user(cpu_id_start, &t->rseq->cpu_id_start)) return -EFAULT; /* * Reset cpu_id to RSEQ_CPU_ID_UNINITIALIZED, so any user coming * in after unregistration can figure out that rseq needs to be * registered again. */ if (put_user(cpu_id, &t->rseq->cpu_id)) return -EFAULT; /* * Reset node_id to its initial state (0). */ if (put_user(node_id, &t->rseq->node_id)) return -EFAULT; /* * Reset mm_cid to its initial state (0). 
*/ if (put_user(mm_cid, &t->rseq->mm_cid)) return -EFAULT; /* * Additional feature fields added after ORIG_RSEQ_SIZE * need to be conditionally reset only if * t->rseq_len != ORIG_RSEQ_SIZE. */ return 0; } static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) { struct rseq_cs __user *urseq_cs; u64 ptr; u32 __user *usig; u32 sig; int ret; #ifdef CONFIG_64BIT if (get_user(ptr, &t->rseq->rseq_cs)) return -EFAULT; #else if (copy_from_user(&ptr, &t->rseq->rseq_cs, sizeof(ptr))) return -EFAULT; #endif if (!ptr) { memset(rseq_cs, 0, sizeof(*rseq_cs)); return 0; } if (ptr >= TASK_SIZE) return -EINVAL; urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr; if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs))) return -EFAULT; if (rseq_cs->start_ip >= TASK_SIZE || rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE || rseq_cs->abort_ip >= TASK_SIZE || rseq_cs->version > 0) return -EINVAL; /* Check for overflow. */ if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip) return -EINVAL; /* Ensure that abort_ip is not in the critical section. */ if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset) return -EINVAL; usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32)); ret = get_user(sig, usig); if (ret) return ret; if (current->rseq_sig != sig) { printk_ratelimited(KERN_WARNING "Possible attack attempt. 
Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n", sig, current->rseq_sig, current->pid, usig); return -EINVAL; } return 0; } static bool rseq_warn_flags(const char *str, u32 flags) { u32 test_flags; if (!flags) return false; test_flags = flags & RSEQ_CS_NO_RESTART_FLAGS; if (test_flags) pr_warn_once("Deprecated flags (%u) in %s ABI structure", test_flags, str); test_flags = flags & ~RSEQ_CS_NO_RESTART_FLAGS; if (test_flags) pr_warn_once("Unknown flags (%u) in %s ABI structure", test_flags, str); return true; } static int rseq_need_restart(struct task_struct *t, u32 cs_flags) { u32 flags, event_mask; int ret; if (rseq_warn_flags("rseq_cs", cs_flags)) return -EINVAL; /* Get thread flags. */ ret = get_user(flags, &t->rseq->flags); if (ret) return ret; if (rseq_warn_flags("rseq", flags)) return -EINVAL; /* * Load and clear event mask atomically with respect to * scheduler preemption. */ preempt_disable(); event_mask = t->rseq_event_mask; t->rseq_event_mask = 0; preempt_enable(); return !!event_mask; } static int clear_rseq_cs(struct task_struct *t) { /* * The rseq_cs field is set to NULL on preemption or signal * delivery on top of rseq assembly block, as well as on top * of code outside of the rseq assembly block. This performs * a lazy clear of the rseq_cs field. * * Set rseq_cs to NULL. */ #ifdef CONFIG_64BIT return put_user(0UL, &t->rseq->rseq_cs); #else if (clear_user(&t->rseq->rseq_cs, sizeof(t->rseq->rseq_cs))) return -EFAULT; return 0; #endif } /* * Unsigned comparison will be true when ip >= start_ip, and when * ip < start_ip + post_commit_offset. 
*/ static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs) { return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset; } static int rseq_ip_fixup(struct pt_regs *regs) { unsigned long ip = instruction_pointer(regs); struct task_struct *t = current; struct rseq_cs rseq_cs; int ret; ret = rseq_get_rseq_cs(t, &rseq_cs); if (ret) return ret; /* * Handle potentially not being within a critical section. * If not nested over a rseq critical section, restart is useless. * Clear the rseq_cs pointer and return. */ if (!in_rseq_cs(ip, &rseq_cs)) return clear_rseq_cs(t); ret = rseq_need_restart(t, rseq_cs.flags); if (ret <= 0) return ret; ret = clear_rseq_cs(t); if (ret) return ret; trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset, rseq_cs.abort_ip); instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip); return 0; } /* * This resume handler must always be executed between any of: * - preemption, * - signal delivery, * and return to user-space. * * This is how we can ensure that the entire rseq critical section * will issue the commit instruction only if executed atomically with * respect to other threads scheduled on the same CPU, and with respect * to signal handlers. */ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { struct task_struct *t = current; int ret, sig; if (unlikely(t->flags & PF_EXITING)) return; /* * regs is NULL if and only if the caller is in a syscall path. Skip * fixup and leave rseq_cs as is so that rseq_sycall() will detect and * kill a misbehaving userspace on debug kernels. */ if (regs) { ret = rseq_ip_fixup(regs); if (unlikely(ret < 0)) goto error; } if (unlikely(rseq_update_cpu_node_id(t))) goto error; return; error: sig = ksig ? ksig->sig : 0; force_sigsegv(sig); } #ifdef CONFIG_DEBUG_RSEQ /* * Terminate the process if a syscall is issued within a restartable * sequence. 
*/ void rseq_syscall(struct pt_regs *regs) { unsigned long ip = instruction_pointer(regs); struct task_struct *t = current; struct rseq_cs rseq_cs; if (!t->rseq) return; if (rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs)) force_sig(SIGSEGV); } #endif /* * sys_rseq - setup restartable sequences for caller thread. */ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) { int ret; if (flags & RSEQ_FLAG_UNREGISTER) { if (flags & ~RSEQ_FLAG_UNREGISTER) return -EINVAL; /* Unregister rseq for current thread. */ if (current->rseq != rseq || !current->rseq) return -EINVAL; if (rseq_len != current->rseq_len) return -EINVAL; if (current->rseq_sig != sig) return -EPERM; ret = rseq_reset_rseq_cpu_node_id(current); if (ret) return ret; current->rseq = NULL; current->rseq_sig = 0; current->rseq_len = 0; return 0; } if (unlikely(flags)) return -EINVAL; if (current->rseq) { /* * If rseq is already registered, check whether * the provided address differs from the prior * one. */ if (current->rseq != rseq || rseq_len != current->rseq_len) return -EINVAL; if (current->rseq_sig != sig) return -EPERM; /* Already registered. */ return -EBUSY; } /* * If there was no rseq previously registered, ensure the provided rseq * is properly aligned, as communcated to user-space through the ELF * auxiliary vector AT_RSEQ_ALIGN. If rseq_len is the original rseq * size, the required alignment is the original struct rseq alignment. * * In order to be valid, rseq_len is either the original rseq size, or * large enough to contain all supported fields, as communicated to * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE. 
*/ if (rseq_len < ORIG_RSEQ_SIZE || (rseq_len == ORIG_RSEQ_SIZE && !IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE)) || (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) || rseq_len < offsetof(struct rseq, end)))) return -EINVAL; if (!access_ok(rseq, rseq_len)) return -EFAULT; current->rseq = rseq; current->rseq_len = rseq_len; current->rseq_sig = sig; /* * If rseq was previously inactive, and has just been * registered, ensure the cpu_id_start and cpu_id fields * are updated before returning to user-space. */ rseq_set_notify_resume(current); return 0; }
18 1328 1367 1 1 1311 1339 1324 1327 7 1311 1328 1328 1334 1323 1335 1363 1337 1336 1337 1329 1330 7 1328 8 1333 1327 12 1327 1 1 1 1 1 1 1 1 1 2197 2190 18 19 18 19 19 19 17 16 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/syscalls.h> #include <linux/export.h> #include <linux/uaccess.h> #include 
<linux/fs_struct.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/prefetch.h> #include "mount.h" #include "internal.h" struct prepend_buffer { char *buf; int len; }; #define DECLARE_BUFFER(__name, __buf, __len) \ struct prepend_buffer __name = {.buf = __buf + __len, .len = __len} static char *extract_string(struct prepend_buffer *p) { if (likely(p->len >= 0)) return p->buf; return ERR_PTR(-ENAMETOOLONG); } static bool prepend_char(struct prepend_buffer *p, unsigned char c) { if (likely(p->len > 0)) { p->len--; *--p->buf = c; return true; } p->len = -1; return false; } /* * The source of the prepend data can be an optimistic load * of a dentry name and length. And because we don't hold any * locks, the length and the pointer to the name may not be * in sync if a concurrent rename happens, and the kernel * copy might fault as a result. * * The end result will correct itself when we check the * rename sequence count, but we need to be able to handle * the fault gracefully. */ static bool prepend_copy(void *dst, const void *src, int len) { if (unlikely(copy_from_kernel_nofault(dst, src, len))) { memset(dst, 'x', len); return false; } return true; } static bool prepend(struct prepend_buffer *p, const char *str, int namelen) { // Already overflowed? if (p->len < 0) return false; // Will overflow? if (p->len < namelen) { // Fill as much as possible from the end of the name str += namelen - p->len; p->buf -= p->len; prepend_copy(p->buf, str, p->len); p->len = -1; return false; } // Fits fully p->len -= namelen; p->buf -= namelen; return prepend_copy(p->buf, str, namelen); } /** * prepend_name - prepend a pathname in front of current buffer pointer * @p: prepend buffer which contains buffer pointer and allocated length * @name: name string and length qstr structure * * With RCU path tracing, it may race with d_move(). Use READ_ONCE() to * make sure that either the old or the new name pointer and length are * fetched. 
However, there may be mismatch between length and pointer. * But since the length cannot be trusted, we need to copy the name very * carefully when doing the prepend_copy(). It also prepends "/" at * the beginning of the name. The sequence number check at the caller will * retry it again when a d_move() does happen. So any garbage in the buffer * due to mismatched pointer and length will be discarded. * * Load acquire is needed to make sure that we see the new name data even * if we might get the length wrong. */ static bool prepend_name(struct prepend_buffer *p, const struct qstr *name) { const char *dname = smp_load_acquire(&name->name); /* ^^^ */ u32 dlen = READ_ONCE(name->len); return prepend(p, dname, dlen) && prepend_char(p, '/'); } static int __prepend_path(const struct dentry *dentry, const struct mount *mnt, const struct path *root, struct prepend_buffer *p) { while (dentry != root->dentry || &mnt->mnt != root->mnt) { const struct dentry *parent = READ_ONCE(dentry->d_parent); if (dentry == mnt->mnt.mnt_root) { struct mount *m = READ_ONCE(mnt->mnt_parent); struct mnt_namespace *mnt_ns; if (likely(mnt != m)) { dentry = READ_ONCE(mnt->mnt_mountpoint); mnt = m; continue; } /* Global root */ mnt_ns = READ_ONCE(mnt->mnt_ns); /* open-coded is_mounted() to use local mnt_ns */ if (!IS_ERR_OR_NULL(mnt_ns) && !is_anon_ns(mnt_ns)) return 1; // absolute root else return 2; // detached or not attached yet } if (unlikely(dentry == parent)) /* Escaped? */ return 3; prefetch(parent); if (!prepend_name(p, &dentry->d_name)) break; dentry = parent; } return 0; } /** * prepend_path - Prepend path string to a buffer * @path: the dentry/vfsmount to report * @root: root vfsmnt/dentry * @p: prepend buffer which contains buffer pointer and allocated length * * The function will first try to write out the pathname without taking any * lock other than the RCU read lock to make sure that dentries won't go away. 
* It only checks the sequence number of the global rename_lock as any change * in the dentry's d_seq will be preceded by changes in the rename_lock * sequence number. If the sequence number had been changed, it will restart * the whole pathname back-tracing sequence again by taking the rename_lock. * In this case, there is no need to take the RCU read lock as the recursive * parent pointer references will keep the dentry chain alive as long as no * rename operation is performed. */ static int prepend_path(const struct path *path, const struct path *root, struct prepend_buffer *p) { unsigned seq, m_seq = 0; struct prepend_buffer b; int error; rcu_read_lock(); restart_mnt: read_seqbegin_or_lock(&mount_lock, &m_seq); seq = 0; rcu_read_lock(); restart: b = *p; read_seqbegin_or_lock(&rename_lock, &seq); error = __prepend_path(path->dentry, real_mount(path->mnt), root, &b); if (!(seq & 1)) rcu_read_unlock(); if (need_seqretry(&rename_lock, seq)) { seq = 1; goto restart; } done_seqretry(&rename_lock, seq); if (!(m_seq & 1)) rcu_read_unlock(); if (need_seqretry(&mount_lock, m_seq)) { m_seq = 1; goto restart_mnt; } done_seqretry(&mount_lock, m_seq); if (unlikely(error == 3)) b = *p; if (b.len == p->len) prepend_char(&b, '/'); *p = b; return error; } /** * __d_path - return the path of a dentry * @path: the dentry/vfsmount to report * @root: root vfsmnt/dentry * @buf: buffer to return value in * @buflen: buffer length * * Convert a dentry into an ASCII path name. * * Returns a pointer into the buffer or an error code if the * path was too long. * * "buflen" should be positive. * * If the path is not reachable from the supplied root, return %NULL. 
*/ char *__d_path(const struct path *path, const struct path *root, char *buf, int buflen) { DECLARE_BUFFER(b, buf, buflen); prepend_char(&b, 0); if (unlikely(prepend_path(path, root, &b) > 0)) return NULL; return extract_string(&b); } char *d_absolute_path(const struct path *path, char *buf, int buflen) { struct path root = {}; DECLARE_BUFFER(b, buf, buflen); prepend_char(&b, 0); if (unlikely(prepend_path(path, &root, &b) > 1)) return ERR_PTR(-EINVAL); return extract_string(&b); } static void get_fs_root_rcu(struct fs_struct *fs, struct path *root) { unsigned seq; do { seq = read_seqcount_begin(&fs->seq); *root = fs->root; } while (read_seqcount_retry(&fs->seq, seq)); } /** * d_path - return the path of a dentry * @path: path to report * @buf: buffer to return value in * @buflen: buffer length * * Convert a dentry into an ASCII path name. If the entry has been deleted * the string " (deleted)" is appended. Note that this is ambiguous. * * Returns a pointer into the buffer or an error code if the path was * too long. Note: Callers should use the returned pointer, not the passed * in buffer, to use the name! The implementation often starts at an offset * into the buffer, and may leave 0 bytes at the start. * * "buflen" should be positive. */ char *d_path(const struct path *path, char *buf, int buflen) { DECLARE_BUFFER(b, buf, buflen); struct path root; /* * We have various synthetic filesystems that never get mounted. On * these filesystems dentries are never used for lookup purposes, and * thus don't need to be hashed. They also don't need a name until a * user wants to identify the object in /proc/pid/fd/. The little hack * below allows us to generate a name for these objects on demand: * * Some pseudo inodes are mountable. When they are mounted * path->dentry == path->mnt->mnt_root. In that case don't call d_dname * and instead have d_path return the mounted path. 
*/ if (path->dentry->d_op && path->dentry->d_op->d_dname && (!IS_ROOT(path->dentry) || path->dentry != path->mnt->mnt_root)) return path->dentry->d_op->d_dname(path->dentry, buf, buflen); rcu_read_lock(); get_fs_root_rcu(current->fs, &root); if (unlikely(d_unlinked(path->dentry))) prepend(&b, " (deleted)", 11); else prepend_char(&b, 0); prepend_path(path, &root, &b); rcu_read_unlock(); return extract_string(&b); } EXPORT_SYMBOL(d_path); /* * Helper function for dentry_operations.d_dname() members */ char *dynamic_dname(char *buffer, int buflen, const char *fmt, ...) { va_list args; char temp[64]; int sz; va_start(args, fmt); sz = vsnprintf(temp, sizeof(temp), fmt, args) + 1; va_end(args); if (sz > sizeof(temp) || sz > buflen) return ERR_PTR(-ENAMETOOLONG); buffer += buflen - sz; return memcpy(buffer, temp, sz); } char *simple_dname(struct dentry *dentry, char *buffer, int buflen) { DECLARE_BUFFER(b, buffer, buflen); /* these dentries are never renamed, so d_lock is not needed */ prepend(&b, " (deleted)", 11); prepend(&b, dentry->d_name.name, dentry->d_name.len); prepend_char(&b, '/'); return extract_string(&b); } /* * Write full pathname from the root of the filesystem into the buffer. 
*/ static char *__dentry_path(const struct dentry *d, struct prepend_buffer *p) { const struct dentry *dentry; struct prepend_buffer b; int seq = 0; rcu_read_lock(); restart: dentry = d; b = *p; read_seqbegin_or_lock(&rename_lock, &seq); while (!IS_ROOT(dentry)) { const struct dentry *parent = dentry->d_parent; prefetch(parent); if (!prepend_name(&b, &dentry->d_name)) break; dentry = parent; } if (!(seq & 1)) rcu_read_unlock(); if (need_seqretry(&rename_lock, seq)) { seq = 1; goto restart; } done_seqretry(&rename_lock, seq); if (b.len == p->len) prepend_char(&b, '/'); return extract_string(&b); } char *dentry_path_raw(const struct dentry *dentry, char *buf, int buflen) { DECLARE_BUFFER(b, buf, buflen); prepend_char(&b, 0); return __dentry_path(dentry, &b); } EXPORT_SYMBOL(dentry_path_raw); char *dentry_path(const struct dentry *dentry, char *buf, int buflen) { DECLARE_BUFFER(b, buf, buflen); if (unlikely(d_unlinked(dentry))) prepend(&b, "//deleted", 10); else prepend_char(&b, 0); return __dentry_path(dentry, &b); } static void get_fs_root_and_pwd_rcu(struct fs_struct *fs, struct path *root, struct path *pwd) { unsigned seq; do { seq = read_seqcount_begin(&fs->seq); *root = fs->root; *pwd = fs->pwd; } while (read_seqcount_retry(&fs->seq, seq)); } /* * NOTE! The user-level library version returns a * character pointer. The kernel system call just * returns the length of the buffer filled (which * includes the ending '\0' character), or a negative * error value. 
So libc would do something like * * char *getcwd(char * buf, size_t size) * { * int retval; * * retval = sys_getcwd(buf, size); * if (retval >= 0) * return buf; * errno = -retval; * return NULL; * } */ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) { int error; struct path pwd, root; char *page = __getname(); if (!page) return -ENOMEM; rcu_read_lock(); get_fs_root_and_pwd_rcu(current->fs, &root, &pwd); if (unlikely(d_unlinked(pwd.dentry))) { rcu_read_unlock(); error = -ENOENT; } else { unsigned len; DECLARE_BUFFER(b, page, PATH_MAX); prepend_char(&b, 0); if (unlikely(prepend_path(&pwd, &root, &b) > 0)) prepend(&b, "(unreachable)", 13); rcu_read_unlock(); len = PATH_MAX - b.len; if (unlikely(len > PATH_MAX)) error = -ENAMETOOLONG; else if (unlikely(len > size)) error = -ERANGE; else if (copy_to_user(buf, b.buf, len)) error = -EFAULT; else error = len; } __putname(page); return error; }
16 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 /* BlueZ - Bluetooth protocol stack for Linux Copyright (C) 2000-2001 Qualcomm Incorporated Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS SOFTWARE IS DISCLAIMED. */ /* Bluetooth kernel library. */ #define pr_fmt(fmt) "Bluetooth: " fmt #include <linux/export.h> #include <net/bluetooth/bluetooth.h> /** * baswap() - Swaps the order of a bd address * @dst: Pointer to a bdaddr_t struct that will store the swapped * bd address. * @src: Pointer to the bdaddr_t struct to be swapped. * * This function reverses the byte order of a Bluetooth device * address. */ void baswap(bdaddr_t *dst, const bdaddr_t *src) { const unsigned char *s = (const unsigned char *)src; unsigned char *d = (unsigned char *)dst; unsigned int i; for (i = 0; i < 6; i++) d[i] = s[5 - i]; } EXPORT_SYMBOL(baswap); /** * bt_to_errno() - Bluetooth error codes to standard errno * @code: Bluetooth error code to be converted * * This function takes a Bluetooth error code as input and convets * it to an equivalent Unix/standard errno value. * * Return: * * If the bt error code is known, an equivalent Unix errno value * is returned. * If the given bt error code is not known, ENOSYS is returned. 
*/ int bt_to_errno(__u16 code) { switch (code) { case 0: return 0; case 0x01: return EBADRQC; case 0x02: return ENOTCONN; case 0x03: return EIO; case 0x04: case 0x3c: return EHOSTDOWN; case 0x05: return EACCES; case 0x06: return EBADE; case 0x07: return ENOMEM; case 0x08: return ETIMEDOUT; case 0x09: return EMLINK; case 0x0a: return EMLINK; case 0x0b: return EALREADY; case 0x0c: return EBUSY; case 0x0d: case 0x0e: case 0x0f: return ECONNREFUSED; case 0x10: return ETIMEDOUT; case 0x11: case 0x27: case 0x29: case 0x20: return EOPNOTSUPP; case 0x12: return EINVAL; case 0x13: case 0x14: case 0x15: return ECONNRESET; case 0x16: return ECONNABORTED; case 0x17: return ELOOP; case 0x18: return EACCES; case 0x1a: return EPROTONOSUPPORT; case 0x1b: return ECONNREFUSED; case 0x19: case 0x1e: case 0x23: case 0x24: case 0x25: return EPROTO; default: return ENOSYS; } } EXPORT_SYMBOL(bt_to_errno); /** * bt_status() - Standard errno value to Bluetooth error code * @err: Unix/standard errno value to be converted * * This function converts a standard/Unix errno value to an * equivalent Bluetooth error code. * * Return: Bluetooth error code. * * If the given errno is not found, 0x1f is returned by default * which indicates an unspecified error. * For err >= 0, no conversion is performed, and the same value * is immediately returned. 
*/ __u8 bt_status(int err) { if (err >= 0) return err; switch (err) { case -EBADRQC: return 0x01; case -ENOTCONN: return 0x02; case -EIO: return 0x03; case -EHOSTDOWN: return 0x04; case -EACCES: return 0x05; case -EBADE: return 0x06; case -ENOMEM: return 0x07; case -ETIMEDOUT: return 0x08; case -EMLINK: return 0x09; case -EALREADY: return 0x0b; case -EBUSY: return 0x0c; case -ECONNREFUSED: return 0x0d; case -EOPNOTSUPP: return 0x11; case -EINVAL: return 0x12; case -ECONNRESET: return 0x13; case -ECONNABORTED: return 0x16; case -ELOOP: return 0x17; case -EPROTONOSUPPORT: return 0x1a; case -EPROTO: return 0x19; default: return 0x1f; } } EXPORT_SYMBOL(bt_status); /** * bt_info() - Log Bluetooth information message * @format: Message's format string */ void bt_info(const char *format, ...) { struct va_format vaf; va_list args; va_start(args, format); vaf.fmt = format; vaf.va = &args; pr_info("%pV", &vaf); va_end(args); } EXPORT_SYMBOL(bt_info); /** * bt_warn() - Log Bluetooth warning message * @format: Message's format string */ void bt_warn(const char *format, ...) { struct va_format vaf; va_list args; va_start(args, format); vaf.fmt = format; vaf.va = &args; pr_warn("%pV", &vaf); va_end(args); } EXPORT_SYMBOL(bt_warn); /** * bt_err() - Log Bluetooth error message * @format: Message's format string */ void bt_err(const char *format, ...) { struct va_format vaf; va_list args; va_start(args, format); vaf.fmt = format; vaf.va = &args; pr_err("%pV", &vaf); va_end(args); } EXPORT_SYMBOL(bt_err); #ifdef CONFIG_BT_FEATURE_DEBUG static bool debug_enable; void bt_dbg_set(bool enable) { debug_enable = enable; } bool bt_dbg_get(void) { return debug_enable; } /** * bt_dbg() - Log Bluetooth debugging message * @format: Message's format string */ void bt_dbg(const char *format, ...) 
{ struct va_format vaf; va_list args; if (likely(!debug_enable)) return; va_start(args, format); vaf.fmt = format; vaf.va = &args; printk(KERN_DEBUG pr_fmt("%pV"), &vaf); va_end(args); } EXPORT_SYMBOL(bt_dbg); #endif /** * bt_warn_ratelimited() - Log rate-limited Bluetooth warning message * @format: Message's format string * * This functions works like bt_warn, but it uses rate limiting * to prevent the message from being logged too often. */ void bt_warn_ratelimited(const char *format, ...) { struct va_format vaf; va_list args; va_start(args, format); vaf.fmt = format; vaf.va = &args; pr_warn_ratelimited("%pV", &vaf); va_end(args); } EXPORT_SYMBOL(bt_warn_ratelimited); /** * bt_err_ratelimited() - Log rate-limited Bluetooth error message * @format: Message's format string * * This functions works like bt_err, but it uses rate limiting * to prevent the message from being logged too often. */ void bt_err_ratelimited(const char *format, ...) { struct va_format vaf; va_list args; va_start(args, format); vaf.fmt = format; vaf.va = &args; pr_err_ratelimited("%pV", &vaf); va_end(args); } EXPORT_SYMBOL(bt_err_ratelimited);
33 22 25 23 16 2 25 25 24 25 25 308 308 16 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 
516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 // SPDX-License-Identifier: GPL-2.0+ /* * linux/fs/jbd2/revoke.c * * Written by Stephen C. Tweedie <sct@redhat.com>, 2000 * * Copyright 2000 Red Hat corp --- All Rights Reserved * * Journal revoke routines for the generic filesystem journaling code; * part of the ext2fs journaling system. * * Revoke is the mechanism used to prevent old log records for deleted * metadata from being replayed on top of newer data using the same * blocks. The revoke mechanism is used in two separate places: * * + Commit: during commit we write the entire list of the current * transaction's revoked blocks to the journal * * + Recovery: during recovery we record the transaction ID of all * revoked blocks. If there are multiple revoke records in the log * for a single block, only the last one counts, and if there is a log * entry for a block beyond the last revoke, then that log entry still * gets replayed. 
* * We can get interactions between revokes and new log data within a * single transaction: * * Block is revoked and then journaled: * The desired end result is the journaling of the new block, so we * cancel the revoke before the transaction commits. * * Block is journaled and then revoked: * The revoke must take precedence over the write of the block, so we * need either to cancel the journal entry or to write the revoke * later in the log than the log block. In this case, we choose the * latter: journaling a block cancels any revoke record for that block * in the current transaction, so any revoke for that block in the * transaction must have happened after the block was journaled and so * the revoke must take precedence. * * Block is revoked and then written as data: * The data write is allowed to succeed, but the revoke is _not_ * cancelled. We still need to prevent old log records from * overwriting the new data. We don't even need to clear the revoke * bit here. * * We cache revoke status of a buffer in the current transaction in b_states * bits. As the name says, revokevalid flag indicates that the cached revoke * status of a buffer is valid and we can rely on the cached status. * * Revoke information on buffers is a tri-state value: * * RevokeValid clear: no cached revoke status, need to look it up * RevokeValid set, Revoked clear: * buffer has not been revoked, and cancel_revoke * need do nothing. * RevokeValid set, Revoked set: * buffer has been revoked. * * Locking rules: * We keep two hash tables of revoke records. One hashtable belongs to the * running transaction (is pointed to by journal->j_revoke), the other one * belongs to the committing transaction. Accesses to the second hash table * happen only from the kjournald and no other thread touches this table. Also * journal_switch_revoke_table() which switches which hashtable belongs to the * running and which to the committing transaction is called only from * kjournald. 
Therefore we need no locks when accessing the hashtable belonging * to the committing transaction. * * All users operating on the hash table belonging to the running transaction * have a handle to the transaction. Therefore they are safe from kjournald * switching hash tables under them. For operations on the lists of entries in * the hash table j_revoke_lock is used. * * Finally, also replay code uses the hash tables but at this moment no one else * can touch them (filesystem isn't mounted yet) and hence no locking is * needed. */ #ifndef __KERNEL__ #include "jfs_user.h" #else #include <linux/time.h> #include <linux/fs.h> #include <linux/jbd2.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/init.h> #include <linux/bio.h> #include <linux/log2.h> #include <linux/hash.h> #endif static struct kmem_cache *jbd2_revoke_record_cache; static struct kmem_cache *jbd2_revoke_table_cache; /* Each revoke record represents one single revoked block. During journal replay, this involves recording the transaction ID of the last transaction to revoke this block. */ struct jbd2_revoke_record_s { struct list_head hash; tid_t sequence; /* Used for recovery only */ unsigned long long blocknr; }; /* The revoke table is just a simple hash table of revoke records. */ struct jbd2_revoke_table_s { /* It is conceivable that we might want a larger hash table * for recovery. Must be a power of two. 
*/ int hash_size; int hash_shift; struct list_head *hash_table; }; #ifdef __KERNEL__ static void write_one_revoke_record(transaction_t *, struct list_head *, struct buffer_head **, int *, struct jbd2_revoke_record_s *); static void flush_descriptor(journal_t *, struct buffer_head *, int); #endif /* Utility functions to maintain the revoke table */ static inline int hash(journal_t *journal, unsigned long long block) { return hash_64(block, journal->j_revoke->hash_shift); } static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr, tid_t seq) { struct list_head *hash_list; struct jbd2_revoke_record_s *record; gfp_t gfp_mask = GFP_NOFS; if (journal_oom_retry) gfp_mask |= __GFP_NOFAIL; record = kmem_cache_alloc(jbd2_revoke_record_cache, gfp_mask); if (!record) return -ENOMEM; record->sequence = seq; record->blocknr = blocknr; hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; spin_lock(&journal->j_revoke_lock); list_add(&record->hash, hash_list); spin_unlock(&journal->j_revoke_lock); return 0; } /* Find a revoke record in the journal's hash table. 
*/ static struct jbd2_revoke_record_s *find_revoke_record(journal_t *journal, unsigned long long blocknr) { struct list_head *hash_list; struct jbd2_revoke_record_s *record; hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; spin_lock(&journal->j_revoke_lock); record = (struct jbd2_revoke_record_s *) hash_list->next; while (&(record->hash) != hash_list) { if (record->blocknr == blocknr) { spin_unlock(&journal->j_revoke_lock); return record; } record = (struct jbd2_revoke_record_s *) record->hash.next; } spin_unlock(&journal->j_revoke_lock); return NULL; } void jbd2_journal_destroy_revoke_record_cache(void) { kmem_cache_destroy(jbd2_revoke_record_cache); jbd2_revoke_record_cache = NULL; } void jbd2_journal_destroy_revoke_table_cache(void) { kmem_cache_destroy(jbd2_revoke_table_cache); jbd2_revoke_table_cache = NULL; } int __init jbd2_journal_init_revoke_record_cache(void) { J_ASSERT(!jbd2_revoke_record_cache); jbd2_revoke_record_cache = KMEM_CACHE(jbd2_revoke_record_s, SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY); if (!jbd2_revoke_record_cache) { pr_emerg("JBD2: failed to create revoke_record cache\n"); return -ENOMEM; } return 0; } int __init jbd2_journal_init_revoke_table_cache(void) { J_ASSERT(!jbd2_revoke_table_cache); jbd2_revoke_table_cache = KMEM_CACHE(jbd2_revoke_table_s, SLAB_TEMPORARY); if (!jbd2_revoke_table_cache) { pr_emerg("JBD2: failed to create revoke_table cache\n"); return -ENOMEM; } return 0; } static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size) { int shift = 0; int tmp = hash_size; struct jbd2_revoke_table_s *table; table = kmem_cache_alloc(jbd2_revoke_table_cache, GFP_KERNEL); if (!table) goto out; while((tmp >>= 1UL) != 0UL) shift++; table->hash_size = hash_size; table->hash_shift = shift; table->hash_table = kmalloc_array(hash_size, sizeof(struct list_head), GFP_KERNEL); if (!table->hash_table) { kmem_cache_free(jbd2_revoke_table_cache, table); table = NULL; goto out; } for (tmp = 0; tmp < hash_size; tmp++) 
INIT_LIST_HEAD(&table->hash_table[tmp]); out: return table; } static void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table) { int i; struct list_head *hash_list; for (i = 0; i < table->hash_size; i++) { hash_list = &table->hash_table[i]; J_ASSERT(list_empty(hash_list)); } kfree(table->hash_table); kmem_cache_free(jbd2_revoke_table_cache, table); } /* Initialise the revoke table for a given journal to a given size. */ int jbd2_journal_init_revoke(journal_t *journal, int hash_size) { J_ASSERT(journal->j_revoke_table[0] == NULL); J_ASSERT(is_power_of_2(hash_size)); journal->j_revoke_table[0] = jbd2_journal_init_revoke_table(hash_size); if (!journal->j_revoke_table[0]) goto fail0; journal->j_revoke_table[1] = jbd2_journal_init_revoke_table(hash_size); if (!journal->j_revoke_table[1]) goto fail1; journal->j_revoke = journal->j_revoke_table[1]; spin_lock_init(&journal->j_revoke_lock); return 0; fail1: jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]); journal->j_revoke_table[0] = NULL; fail0: return -ENOMEM; } /* Destroy a journal's revoke table. The table must already be empty! */ void jbd2_journal_destroy_revoke(journal_t *journal) { journal->j_revoke = NULL; if (journal->j_revoke_table[0]) jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]); if (journal->j_revoke_table[1]) jbd2_journal_destroy_revoke_table(journal->j_revoke_table[1]); } #ifdef __KERNEL__ /* * jbd2_journal_revoke: revoke a given buffer_head from the journal. This * prevents the block from being replayed during recovery if we take a * crash after this current transaction commits. Any subsequent * metadata writes of the buffer in this transaction cancel the * revoke. * * Note that this call may block --- it is up to the caller to make * sure that there are no further calls to journal_write_metadata * before the revoke is complete. In ext3, this implies calling the * revoke before clearing the block bitmap when we are deleting * metadata. 
* * Revoke performs a jbd2_journal_forget on any buffer_head passed in as a * parameter, but does _not_ forget the buffer_head if the bh was only * found implicitly. * * bh_in may not be a journalled buffer - it may have come off * the hash tables without an attached journal_head. * * If bh_in is non-zero, jbd2_journal_revoke() will decrement its b_count * by one. */ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr, struct buffer_head *bh_in) { struct buffer_head *bh = NULL; journal_t *journal; struct block_device *bdev; int err; might_sleep(); if (bh_in) BUFFER_TRACE(bh_in, "enter"); journal = handle->h_transaction->t_journal; if (!jbd2_journal_set_features(journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)){ J_ASSERT (!"Cannot set revoke feature!"); return -EINVAL; } bdev = journal->j_fs_dev; bh = bh_in; if (!bh) { bh = __find_get_block(bdev, blocknr, journal->j_blocksize); if (bh) BUFFER_TRACE(bh, "found on hash"); } #ifdef JBD2_EXPENSIVE_CHECKING else { struct buffer_head *bh2; /* If there is a different buffer_head lying around in * memory anywhere... */ bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize); if (bh2) { /* ... and it has RevokeValid status... */ if (bh2 != bh && buffer_revokevalid(bh2)) /* ...then it better be revoked too, * since it's illegal to create a revoke * record against a buffer_head which is * not marked revoked --- that would * risk missing a subsequent revoke * cancel. */ J_ASSERT_BH(bh2, buffer_revoked(bh2)); put_bh(bh2); } } #endif if (WARN_ON_ONCE(handle->h_revoke_credits <= 0)) { if (!bh_in) brelse(bh); return -EIO; } /* We really ought not ever to revoke twice in a row without first having the revoke cancelled: it's illegal to free a block twice without allocating it in between! 
*/ if (bh) { if (!J_EXPECT_BH(bh, !buffer_revoked(bh), "inconsistent data on disk")) { if (!bh_in) brelse(bh); return -EIO; } set_buffer_revoked(bh); set_buffer_revokevalid(bh); if (bh_in) { BUFFER_TRACE(bh_in, "call jbd2_journal_forget"); jbd2_journal_forget(handle, bh_in); } else { BUFFER_TRACE(bh, "call brelse"); __brelse(bh); } } handle->h_revoke_credits--; jbd2_debug(2, "insert revoke for block %llu, bh_in=%p\n",blocknr, bh_in); err = insert_revoke_hash(journal, blocknr, handle->h_transaction->t_tid); BUFFER_TRACE(bh_in, "exit"); return err; } /* * Cancel an outstanding revoke. For use only internally by the * journaling code (called from jbd2_journal_get_write_access). * * We trust buffer_revoked() on the buffer if the buffer is already * being journaled: if there is no revoke pending on the buffer, then we * don't do anything here. * * This would break if it were possible for a buffer to be revoked and * discarded, and then reallocated within the same transaction. In such * a case we would have lost the revoked bit, but when we arrived here * the second time we would still have a pending revoke to cancel. So, * do not trust the Revoked bit on buffers unless RevokeValid is also * set. */ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) { struct jbd2_revoke_record_s *record; journal_t *journal = handle->h_transaction->t_journal; int need_cancel; int did_revoke = 0; /* akpm: debug */ struct buffer_head *bh = jh2bh(jh); jbd2_debug(4, "journal_head %p, cancelling revoke\n", jh); /* Is the existing Revoke bit valid? If so, we trust it, and * only perform the full cancel if the revoke bit is set. If * not, we can't trust the revoke bit, and we need to do the * full search for a revoke record. 
*/ if (test_set_buffer_revokevalid(bh)) { need_cancel = test_clear_buffer_revoked(bh); } else { need_cancel = 1; clear_buffer_revoked(bh); } if (need_cancel) { record = find_revoke_record(journal, bh->b_blocknr); if (record) { jbd2_debug(4, "cancelled existing revoke on " "blocknr %llu\n", (unsigned long long)bh->b_blocknr); spin_lock(&journal->j_revoke_lock); list_del(&record->hash); spin_unlock(&journal->j_revoke_lock); kmem_cache_free(jbd2_revoke_record_cache, record); did_revoke = 1; } } #ifdef JBD2_EXPENSIVE_CHECKING /* There better not be one left behind by now! */ record = find_revoke_record(journal, bh->b_blocknr); J_ASSERT_JH(jh, record == NULL); #endif /* Finally, have we just cleared revoke on an unhashed * buffer_head? If so, we'd better make sure we clear the * revoked status on any hashed alias too, otherwise the revoke * state machine will get very upset later on. */ if (need_cancel) { struct buffer_head *bh2; bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size); if (bh2) { if (bh2 != bh) clear_buffer_revoked(bh2); __brelse(bh2); } } return did_revoke; } /* * journal_clear_revoked_flag clears revoked flag of buffers in * revoke table to reflect there is no revoked buffers in the next * transaction which is going to be started. 
*/ void jbd2_clear_buffer_revoked_flags(journal_t *journal) { struct jbd2_revoke_table_s *revoke = journal->j_revoke; int i = 0; for (i = 0; i < revoke->hash_size; i++) { struct list_head *hash_list; struct list_head *list_entry; hash_list = &revoke->hash_table[i]; list_for_each(list_entry, hash_list) { struct jbd2_revoke_record_s *record; struct buffer_head *bh; record = (struct jbd2_revoke_record_s *)list_entry; bh = __find_get_block(journal->j_fs_dev, record->blocknr, journal->j_blocksize); if (bh) { clear_buffer_revoked(bh); __brelse(bh); } } } } /* journal_switch_revoke table select j_revoke for next transaction * we do not want to suspend any processing until all revokes are * written -bzzz */ void jbd2_journal_switch_revoke_table(journal_t *journal) { int i; if (journal->j_revoke == journal->j_revoke_table[0]) journal->j_revoke = journal->j_revoke_table[1]; else journal->j_revoke = journal->j_revoke_table[0]; for (i = 0; i < journal->j_revoke->hash_size; i++) INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]); } /* * Write revoke records to the journal for all entries in the current * revoke hash, deleting the entries as we go. */ void jbd2_journal_write_revoke_records(transaction_t *transaction, struct list_head *log_bufs) { journal_t *journal = transaction->t_journal; struct buffer_head *descriptor; struct jbd2_revoke_record_s *record; struct jbd2_revoke_table_s *revoke; struct list_head *hash_list; int i, offset, count; descriptor = NULL; offset = 0; count = 0; /* select revoke table for committing transaction */ revoke = journal->j_revoke == journal->j_revoke_table[0] ? 
journal->j_revoke_table[1] : journal->j_revoke_table[0]; for (i = 0; i < revoke->hash_size; i++) { hash_list = &revoke->hash_table[i]; while (!list_empty(hash_list)) { record = (struct jbd2_revoke_record_s *) hash_list->next; write_one_revoke_record(transaction, log_bufs, &descriptor, &offset, record); count++; list_del(&record->hash); kmem_cache_free(jbd2_revoke_record_cache, record); } } if (descriptor) flush_descriptor(journal, descriptor, offset); jbd2_debug(1, "Wrote %d revoke records\n", count); } /* * Write out one revoke record. We need to create a new descriptor * block if the old one is full or if we have not already created one. */ static void write_one_revoke_record(transaction_t *transaction, struct list_head *log_bufs, struct buffer_head **descriptorp, int *offsetp, struct jbd2_revoke_record_s *record) { journal_t *journal = transaction->t_journal; int csum_size = 0; struct buffer_head *descriptor; int sz, offset; /* If we are already aborting, this all becomes a noop. We still need to go round the loop in jbd2_journal_write_revoke_records in order to free all of the revoke records: only the IO to the journal is omitted. */ if (is_journal_aborted(journal)) return; descriptor = *descriptorp; offset = *offsetp; /* Do we need to leave space at the end for a checksum? 
*/ if (jbd2_journal_has_csum_v2or3(journal)) csum_size = sizeof(struct jbd2_journal_block_tail); if (jbd2_has_feature_64bit(journal)) sz = 8; else sz = 4; /* Make sure we have a descriptor with space left for the record */ if (descriptor) { if (offset + sz > journal->j_blocksize - csum_size) { flush_descriptor(journal, descriptor, offset); descriptor = NULL; } } if (!descriptor) { descriptor = jbd2_journal_get_descriptor_buffer(transaction, JBD2_REVOKE_BLOCK); if (!descriptor) return; /* Record it so that we can wait for IO completion later */ BUFFER_TRACE(descriptor, "file in log_bufs"); jbd2_file_log_bh(log_bufs, descriptor); offset = sizeof(jbd2_journal_revoke_header_t); *descriptorp = descriptor; } if (jbd2_has_feature_64bit(journal)) * ((__be64 *)(&descriptor->b_data[offset])) = cpu_to_be64(record->blocknr); else * ((__be32 *)(&descriptor->b_data[offset])) = cpu_to_be32(record->blocknr); offset += sz; *offsetp = offset; } /* * Flush a revoke descriptor out to the journal. If we are aborting, * this is a noop; otherwise we are generating a buffer which needs to * be waited for during commit, so it has to go onto the appropriate * journal buffer list. */ static void flush_descriptor(journal_t *journal, struct buffer_head *descriptor, int offset) { jbd2_journal_revoke_header_t *header; if (is_journal_aborted(journal)) return; header = (jbd2_journal_revoke_header_t *)descriptor->b_data; header->r_count = cpu_to_be32(offset); jbd2_descriptor_block_csum_set(journal, descriptor); set_buffer_jwrite(descriptor); BUFFER_TRACE(descriptor, "write"); set_buffer_dirty(descriptor); write_dirty_buffer(descriptor, REQ_SYNC); } #endif /* * Revoke support for recovery. * * Recovery needs to be able to: * * record all revoke records, including the tid of the latest instance * of each revoke in the journal * * check whether a given block in a given transaction should be replayed * (ie. 
has not been revoked by a revoke record in that or a subsequent * transaction) * * empty the revoke table after recovery. */ /* * First, setting revoke records. We create a new revoke record for * every block ever revoked in the log as we scan it for recovery, and * we update the existing records if we find multiple revokes for a * single block. */ int jbd2_journal_set_revoke(journal_t *journal, unsigned long long blocknr, tid_t sequence) { struct jbd2_revoke_record_s *record; record = find_revoke_record(journal, blocknr); if (record) { /* If we have multiple occurrences, only record the * latest sequence number in the hashed record */ if (tid_gt(sequence, record->sequence)) record->sequence = sequence; return 0; } return insert_revoke_hash(journal, blocknr, sequence); } /* * Test revoke records. For a given block referenced in the log, has * that block been revoked? A revoke record with a given transaction * sequence number revokes all blocks in that transaction and earlier * ones, but later transactions still need replayed. */ int jbd2_journal_test_revoke(journal_t *journal, unsigned long long blocknr, tid_t sequence) { struct jbd2_revoke_record_s *record; record = find_revoke_record(journal, blocknr); if (!record) return 0; if (tid_gt(sequence, record->sequence)) return 0; return 1; } /* * Finally, once recovery is over, we need to clear the revoke table so * that it can be reused by the running filesystem. */ void jbd2_journal_clear_revoke(journal_t *journal) { int i; struct list_head *hash_list; struct jbd2_revoke_record_s *record; struct jbd2_revoke_table_s *revoke; revoke = journal->j_revoke; for (i = 0; i < revoke->hash_size; i++) { hash_list = &revoke->hash_table[i]; while (!list_empty(hash_list)) { record = (struct jbd2_revoke_record_s*) hash_list->next; list_del(&record->hash); kmem_cache_free(jbd2_revoke_record_cache, record); } } }
12 1 1 9 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2006 Patrick McHardy <kaber@trash.net> */ #include <linux/module.h> #include <linux/init.h> #include <linux/skbuff.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_NFLOG.h> #include <net/netfilter/nf_log.h> MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_DESCRIPTION("Xtables: packet logging to netlink using NFLOG"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_NFLOG"); MODULE_ALIAS("ip6t_NFLOG"); static unsigned int nflog_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_nflog_info *info = par->targinfo; struct net *net = xt_net(par); struct nf_loginfo li; li.type = NF_LOG_TYPE_ULOG; li.u.ulog.copy_len = info->len; li.u.ulog.group = info->group; li.u.ulog.qthreshold = info->threshold; li.u.ulog.flags = 0; if (info->flags & XT_NFLOG_F_COPY_LEN) li.u.ulog.flags |= NF_LOG_F_COPY_LEN; nf_log_packet(net, xt_family(par), xt_hooknum(par), skb, xt_in(par), xt_out(par), &li, "%s", info->prefix); return XT_CONTINUE; } static int nflog_tg_check(const struct xt_tgchk_param *par) { const struct xt_nflog_info *info = par->targinfo; int ret; if (info->flags & ~XT_NFLOG_MASK) return -EINVAL; if (info->prefix[sizeof(info->prefix) - 1] != '\0') return -EINVAL; ret = nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG); if (ret != 0 && !par->nft_compat) { request_module("%s", "nfnetlink_log"); ret = nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG); } return ret; } static void nflog_tg_destroy(const struct xt_tgdtor_param *par) { nf_logger_put(par->family, NF_LOG_TYPE_ULOG); } static struct xt_target nflog_tg_reg __read_mostly = { .name = "NFLOG", .revision = 0, .family = NFPROTO_UNSPEC, .checkentry = 
nflog_tg_check, .destroy = nflog_tg_destroy, .target = nflog_tg, .targetsize = sizeof(struct xt_nflog_info), .me = THIS_MODULE, }; static int __init nflog_tg_init(void) { return xt_register_target(&nflog_tg_reg); } static void __exit nflog_tg_exit(void) { xt_unregister_target(&nflog_tg_reg); } module_init(nflog_tg_init); module_exit(nflog_tg_exit); MODULE_SOFTDEP("pre: nfnetlink_log");
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_RCUPDATE_WAIT_H
#define _LINUX_SCHED_RCUPDATE_WAIT_H

/*
 * RCU synchronization types and methods:
 */

#include <linux/rcupdate.h>
#include <linux/completion.h>
#include <linux/sched.h>

/*
 * Structure allowing asynchronous waiting on RCU.
 *
 * Pairs an rcu_head with a completion; one instance is used per grace
 * period being waited on (see _wait_rcu_gp() below, which declares an
 * array of these, one per call_rcu()-style function passed in).
 */
struct rcu_synchronize {
	struct rcu_head head;
	struct completion completion;
};

/* RCU callback taking the rcu_head embedded in a struct rcu_synchronize. */
void wakeme_after_rcu(struct rcu_head *head);

/*
 * Out-of-line worker behind _wait_rcu_gp().
 * @checktiny:  forwarded from the macro's first argument
 * @n:          number of entries in both arrays
 * @crcu_array: call_rcu()-style functions, one per grace-period type
 * @rs_array:   caller-provided storage, one rcu_synchronize per function
 */
void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
		   struct rcu_synchronize *rs_array);

/*
 * Collect the variadic call_rcu()-style functions into an on-stack array,
 * declare a matching on-stack array of rcu_synchronize structures, and
 * hand both to __wait_rcu_gp().
 */
#define _wait_rcu_gp(checktiny, ...) \
do {									\
	call_rcu_func_t __crcu_array[] = { __VA_ARGS__ };		\
	struct rcu_synchronize __rs_array[ARRAY_SIZE(__crcu_array)];	\
	__wait_rcu_gp(checktiny, ARRAY_SIZE(__crcu_array),		\
			__crcu_array, __rs_array);			\
} while (0)

/* Same as _wait_rcu_gp() with @checktiny fixed to false. */
#define wait_rcu_gp(...) _wait_rcu_gp(false, __VA_ARGS__)

/**
 * synchronize_rcu_mult - Wait concurrently for multiple grace periods
 * @...: List of call_rcu() functions for different grace periods to wait on
 *
 * This macro waits concurrently for multiple types of RCU grace periods.
 * For example, synchronize_rcu_mult(call_rcu, call_rcu_tasks) would wait
 * on concurrent RCU and RCU-tasks grace periods.  Waiting on a given SRCU
 * domain requires you to write a wrapper function for that SRCU domain's
 * call_srcu() function, with this wrapper supplying the pointer to the
 * corresponding srcu_struct.
 *
 * Note that call_rcu_hurry() should be used instead of call_rcu()
 * because in kernels built with CONFIG_RCU_LAZY=y the delay between the
 * invocation of call_rcu() and that of the corresponding RCU callback
 * can be multiple seconds.
 *
 * The first argument tells Tiny RCU's _wait_rcu_gp() not to
 * bother waiting for RCU.  The reason for this is that anywhere
 * synchronize_rcu_mult() can be called is automatically already a full
 * grace period.
 */
#define synchronize_rcu_mult(...) \
	_wait_rcu_gp(IS_ENABLED(CONFIG_TINY_RCU), __VA_ARGS__)

/*
 * Give the scheduler a chance to run from within an RCU read-side section.
 *
 * When CONFIG_DEBUG_ATOMIC_SLEEP is set or CONFIG_PREEMPT_RCU is not set,
 * this drops the RCU read lock, calls cond_resched() and re-takes the read
 * lock.  In every other configuration the body compiles to nothing.
 */
static inline void cond_resched_rcu(void)
{
#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU)
	rcu_read_unlock();
	cond_resched();
	rcu_read_lock();
#endif
}

#endif /* _LINUX_SCHED_RCUPDATE_WAIT_H */
3 3 3 19 3 2 12 179 175 3 176 173 177 181 175 180 174 177 177 172 176 175 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 
509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2016 Mellanox Technologies. All rights reserved. * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> */ #include <net/genetlink.h> #define CREATE_TRACE_POINTS #include <trace/events/devlink.h> #include "devl_internal.h" EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg); EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwerr); EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_trap_report); DEFINE_XARRAY_FLAGS(devlinks, XA_FLAGS_ALLOC); static struct devlink *devlinks_xa_get(unsigned long index) { struct devlink *devlink; rcu_read_lock(); devlink = xa_find(&devlinks, &index, index, DEVLINK_REGISTERED); if (!devlink || !devlink_try_get(devlink)) devlink = NULL; rcu_read_unlock(); return devlink; } /* devlink_rels xarray contains 1:1 relationships between * devlink object and related nested devlink instance. * The xarray index is used to get the nested object from * the nested-in object code. 
*/ static DEFINE_XARRAY_FLAGS(devlink_rels, XA_FLAGS_ALLOC1); #define DEVLINK_REL_IN_USE XA_MARK_0 struct devlink_rel { u32 index; refcount_t refcount; u32 devlink_index; struct { u32 devlink_index; u32 obj_index; devlink_rel_notify_cb_t *notify_cb; devlink_rel_cleanup_cb_t *cleanup_cb; struct delayed_work notify_work; } nested_in; }; static void devlink_rel_free(struct devlink_rel *rel) { xa_erase(&devlink_rels, rel->index); kfree(rel); } static void __devlink_rel_get(struct devlink_rel *rel) { refcount_inc(&rel->refcount); } static void __devlink_rel_put(struct devlink_rel *rel) { if (refcount_dec_and_test(&rel->refcount)) devlink_rel_free(rel); } static void devlink_rel_nested_in_notify_work(struct work_struct *work) { struct devlink_rel *rel = container_of(work, struct devlink_rel, nested_in.notify_work.work); struct devlink *devlink; devlink = devlinks_xa_get(rel->nested_in.devlink_index); if (!devlink) goto rel_put; if (!devl_trylock(devlink)) { devlink_put(devlink); goto reschedule_work; } if (!devl_is_registered(devlink)) { devl_unlock(devlink); devlink_put(devlink); goto rel_put; } if (!xa_get_mark(&devlink_rels, rel->index, DEVLINK_REL_IN_USE)) rel->nested_in.cleanup_cb(devlink, rel->nested_in.obj_index, rel->index); rel->nested_in.notify_cb(devlink, rel->nested_in.obj_index); devl_unlock(devlink); devlink_put(devlink); rel_put: __devlink_rel_put(rel); return; reschedule_work: schedule_delayed_work(&rel->nested_in.notify_work, 1); } static void devlink_rel_nested_in_notify_work_schedule(struct devlink_rel *rel) { __devlink_rel_get(rel); schedule_delayed_work(&rel->nested_in.notify_work, 0); } static struct devlink_rel *devlink_rel_alloc(void) { struct devlink_rel *rel; static u32 next; int err; rel = kzalloc(sizeof(*rel), GFP_KERNEL); if (!rel) return ERR_PTR(-ENOMEM); err = xa_alloc_cyclic(&devlink_rels, &rel->index, rel, xa_limit_32b, &next, GFP_KERNEL); if (err) { kfree(rel); return ERR_PTR(err); } refcount_set(&rel->refcount, 1); 
INIT_DELAYED_WORK(&rel->nested_in.notify_work, &devlink_rel_nested_in_notify_work); return rel; } static void devlink_rel_put(struct devlink *devlink) { struct devlink_rel *rel = devlink->rel; if (!rel) return; xa_clear_mark(&devlink_rels, rel->index, DEVLINK_REL_IN_USE); devlink_rel_nested_in_notify_work_schedule(rel); __devlink_rel_put(rel); devlink->rel = NULL; } void devlink_rel_nested_in_clear(u32 rel_index) { xa_clear_mark(&devlink_rels, rel_index, DEVLINK_REL_IN_USE); } int devlink_rel_nested_in_add(u32 *rel_index, u32 devlink_index, u32 obj_index, devlink_rel_notify_cb_t *notify_cb, devlink_rel_cleanup_cb_t *cleanup_cb, struct devlink *devlink) { struct devlink_rel *rel = devlink_rel_alloc(); ASSERT_DEVLINK_NOT_REGISTERED(devlink); if (IS_ERR(rel)) return PTR_ERR(rel); rel->devlink_index = devlink->index; rel->nested_in.devlink_index = devlink_index; rel->nested_in.obj_index = obj_index; rel->nested_in.notify_cb = notify_cb; rel->nested_in.cleanup_cb = cleanup_cb; *rel_index = rel->index; xa_set_mark(&devlink_rels, rel->index, DEVLINK_REL_IN_USE); devlink->rel = rel; return 0; } /** * devlink_rel_nested_in_notify - Notify the object this devlink * instance is nested in. * @devlink: devlink * * This is called upon network namespace change of devlink instance. * In case this devlink instance is nested in another devlink object, * a notification of a change of this object should be sent * over netlink. The parent devlink instance lock needs to be * taken during the notification preparation. * However, since the devlink lock of nested instance is held here, * we would end with wrong devlink instance lock ordering and * deadlock. Therefore the work is utilized to avoid that. 
*/ void devlink_rel_nested_in_notify(struct devlink *devlink) { struct devlink_rel *rel = devlink->rel; if (!rel) return; devlink_rel_nested_in_notify_work_schedule(rel); } static struct devlink_rel *devlink_rel_find(unsigned long rel_index) { return xa_find(&devlink_rels, &rel_index, rel_index, DEVLINK_REL_IN_USE); } static struct devlink *devlink_rel_devlink_get(u32 rel_index) { struct devlink_rel *rel; u32 devlink_index; if (!rel_index) return NULL; xa_lock(&devlink_rels); rel = devlink_rel_find(rel_index); if (rel) devlink_index = rel->devlink_index; xa_unlock(&devlink_rels); if (!rel) return NULL; return devlinks_xa_get(devlink_index); } int devlink_rel_devlink_handle_put(struct sk_buff *msg, struct devlink *devlink, u32 rel_index, int attrtype, bool *msg_updated) { struct net *net = devlink_net(devlink); struct devlink *rel_devlink; int err; rel_devlink = devlink_rel_devlink_get(rel_index); if (!rel_devlink) return 0; err = devlink_nl_put_nested_handle(msg, net, rel_devlink, attrtype); devlink_put(rel_devlink); if (!err && msg_updated) *msg_updated = true; return err; } void *devlink_priv(struct devlink *devlink) { return &devlink->priv; } EXPORT_SYMBOL_GPL(devlink_priv); struct devlink *priv_to_devlink(void *priv) { return container_of(priv, struct devlink, priv); } EXPORT_SYMBOL_GPL(priv_to_devlink); struct device *devlink_to_dev(const struct devlink *devlink) { return devlink->dev; } EXPORT_SYMBOL_GPL(devlink_to_dev); struct net *devlink_net(const struct devlink *devlink) { return read_pnet(&devlink->_net); } EXPORT_SYMBOL_GPL(devlink_net); void devl_assert_locked(struct devlink *devlink) { lockdep_assert_held(&devlink->lock); } EXPORT_SYMBOL_GPL(devl_assert_locked); #ifdef CONFIG_LOCKDEP /* For use in conjunction with LOCKDEP only e.g. 
rcu_dereference_protected() */ bool devl_lock_is_held(struct devlink *devlink) { return lockdep_is_held(&devlink->lock); } EXPORT_SYMBOL_GPL(devl_lock_is_held); #endif void devl_lock(struct devlink *devlink) { mutex_lock(&devlink->lock); } EXPORT_SYMBOL_GPL(devl_lock); int devl_trylock(struct devlink *devlink) { return mutex_trylock(&devlink->lock); } EXPORT_SYMBOL_GPL(devl_trylock); void devl_unlock(struct devlink *devlink) { mutex_unlock(&devlink->lock); } EXPORT_SYMBOL_GPL(devl_unlock); /** * devlink_try_get() - try to obtain a reference on a devlink instance * @devlink: instance to reference * * Obtain a reference on a devlink instance. A reference on a devlink instance * only implies that it's safe to take the instance lock. It does not imply * that the instance is registered, use devl_is_registered() after taking * the instance lock to check registration status. */ struct devlink *__must_check devlink_try_get(struct devlink *devlink) { if (refcount_inc_not_zero(&devlink->refcount)) return devlink; return NULL; } static void devlink_release(struct work_struct *work) { struct devlink *devlink; devlink = container_of(to_rcu_work(work), struct devlink, rwork); mutex_destroy(&devlink->lock); lockdep_unregister_key(&devlink->lock_key); put_device(devlink->dev); kfree(devlink); } void devlink_put(struct devlink *devlink) { if (refcount_dec_and_test(&devlink->refcount)) queue_rcu_work(system_wq, &devlink->rwork); } struct devlink *devlinks_xa_find_get(struct net *net, unsigned long *indexp) { struct devlink *devlink = NULL; rcu_read_lock(); retry: devlink = xa_find(&devlinks, indexp, ULONG_MAX, DEVLINK_REGISTERED); if (!devlink) goto unlock; if (!devlink_try_get(devlink)) goto next; if (!net_eq(devlink_net(devlink), net)) { devlink_put(devlink); goto next; } unlock: rcu_read_unlock(); return devlink; next: (*indexp)++; goto retry; } /** * devl_register - Register devlink instance * @devlink: devlink */ int devl_register(struct devlink *devlink) { 
ASSERT_DEVLINK_NOT_REGISTERED(devlink); devl_assert_locked(devlink); xa_set_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); devlink_notify_register(devlink); devlink_rel_nested_in_notify(devlink); return 0; } EXPORT_SYMBOL_GPL(devl_register); void devlink_register(struct devlink *devlink) { devl_lock(devlink); devl_register(devlink); devl_unlock(devlink); } EXPORT_SYMBOL_GPL(devlink_register); /** * devl_unregister - Unregister devlink instance * @devlink: devlink */ void devl_unregister(struct devlink *devlink) { ASSERT_DEVLINK_REGISTERED(devlink); devl_assert_locked(devlink); devlink_notify_unregister(devlink); xa_clear_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); devlink_rel_put(devlink); } EXPORT_SYMBOL_GPL(devl_unregister); void devlink_unregister(struct devlink *devlink) { devl_lock(devlink); devl_unregister(devlink); devl_unlock(devlink); } EXPORT_SYMBOL_GPL(devlink_unregister); /** * devlink_alloc_ns - Allocate new devlink instance resources * in specific namespace * * @ops: ops * @priv_size: size of user private data * @net: net namespace * @dev: parent device * * Allocate new devlink instance resources, including devlink index * and name. 
*/ struct devlink *devlink_alloc_ns(const struct devlink_ops *ops, size_t priv_size, struct net *net, struct device *dev) { struct devlink *devlink; static u32 last_id; int ret; WARN_ON(!ops || !dev); if (!devlink_reload_actions_valid(ops)) return NULL; devlink = kzalloc(sizeof(*devlink) + priv_size, GFP_KERNEL); if (!devlink) return NULL; ret = xa_alloc_cyclic(&devlinks, &devlink->index, devlink, xa_limit_31b, &last_id, GFP_KERNEL); if (ret < 0) goto err_xa_alloc; devlink->dev = get_device(dev); devlink->ops = ops; xa_init_flags(&devlink->ports, XA_FLAGS_ALLOC); xa_init_flags(&devlink->params, XA_FLAGS_ALLOC); xa_init_flags(&devlink->snapshot_ids, XA_FLAGS_ALLOC); xa_init_flags(&devlink->nested_rels, XA_FLAGS_ALLOC); write_pnet(&devlink->_net, net); INIT_LIST_HEAD(&devlink->rate_list); INIT_LIST_HEAD(&devlink->linecard_list); INIT_LIST_HEAD(&devlink->sb_list); INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list); INIT_LIST_HEAD(&devlink->resource_list); INIT_LIST_HEAD(&devlink->region_list); INIT_LIST_HEAD(&devlink->reporter_list); INIT_LIST_HEAD(&devlink->trap_list); INIT_LIST_HEAD(&devlink->trap_group_list); INIT_LIST_HEAD(&devlink->trap_policer_list); INIT_RCU_WORK(&devlink->rwork, devlink_release); lockdep_register_key(&devlink->lock_key); mutex_init(&devlink->lock); lockdep_set_class(&devlink->lock, &devlink->lock_key); refcount_set(&devlink->refcount, 1); return devlink; err_xa_alloc: kfree(devlink); return NULL; } EXPORT_SYMBOL_GPL(devlink_alloc_ns); /** * devlink_free - Free devlink instance resources * * @devlink: devlink */ void devlink_free(struct devlink *devlink) { ASSERT_DEVLINK_NOT_REGISTERED(devlink); WARN_ON(!list_empty(&devlink->trap_policer_list)); WARN_ON(!list_empty(&devlink->trap_group_list)); WARN_ON(!list_empty(&devlink->trap_list)); WARN_ON(!list_empty(&devlink->reporter_list)); WARN_ON(!list_empty(&devlink->region_list)); WARN_ON(!list_empty(&devlink->resource_list)); WARN_ON(!list_empty(&devlink->dpipe_table_list)); 
WARN_ON(!list_empty(&devlink->sb_list)); WARN_ON(!list_empty(&devlink->rate_list)); WARN_ON(!list_empty(&devlink->linecard_list)); WARN_ON(!xa_empty(&devlink->ports)); xa_destroy(&devlink->nested_rels); xa_destroy(&devlink->snapshot_ids); xa_destroy(&devlink->params); xa_destroy(&devlink->ports); xa_erase(&devlinks, devlink->index); devlink_put(devlink); } EXPORT_SYMBOL_GPL(devlink_free); static void __net_exit devlink_pernet_pre_exit(struct net *net) { struct devlink *devlink; u32 actions_performed; unsigned long index; int err; /* In case network namespace is getting destroyed, reload * all devlink instances from this namespace into init_net. */ devlinks_xa_for_each_registered_get(net, index, devlink) { devl_dev_lock(devlink, true); err = 0; if (devl_is_registered(devlink)) err = devlink_reload(devlink, &init_net, DEVLINK_RELOAD_ACTION_DRIVER_REINIT, DEVLINK_RELOAD_LIMIT_UNSPEC, &actions_performed, NULL); devl_dev_unlock(devlink, true); devlink_put(devlink); if (err && err != -EOPNOTSUPP) pr_warn("Failed to reload devlink instance into init_net\n"); } } static struct pernet_operations devlink_pernet_ops __net_initdata = { .pre_exit = devlink_pernet_pre_exit, }; static struct notifier_block devlink_port_netdevice_nb = { .notifier_call = devlink_port_netdevice_event, }; static int __init devlink_init(void) { int err; err = register_pernet_subsys(&devlink_pernet_ops); if (err) goto out; err = genl_register_family(&devlink_nl_family); if (err) goto out_unreg_pernet_subsys; err = register_netdevice_notifier(&devlink_port_netdevice_nb); if (!err) return 0; genl_unregister_family(&devlink_nl_family); out_unreg_pernet_subsys: unregister_pernet_subsys(&devlink_pernet_ops); out: WARN_ON(err); return err; } subsys_initcall(devlink_init);
73 72 95 5 97 5 3 3 9 5 4 3 3 3 7 7 6 9 1 1 56 58 9 8 9 15 14 1 72 71 7 8 5 8 7 4 5 8 8 1 127 79 102 36 87 94 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 
// SPDX-License-Identifier: GPL-2.0-only
/*
 * TCP CUBIC: Binary Increase Congestion control for TCP v2.3
 * Home page:
 *      http://netsrv.csc.ncsu.edu/twiki/bin/view/Main/BIC
 * This is from the implementation of CUBIC TCP in
 * Sangtae Ha, Injong Rhee and Lisong Xu,
 *  "CUBIC: A New TCP-Friendly High-Speed TCP Variant"
 *  in ACM SIGOPS Operating System Review, July 2008.
 * Available from:
 *  http://netsrv.csc.ncsu.edu/export/cubic_a_new_tcp_2008.pdf
 *
 * CUBIC integrates a new slow start algorithm, called HyStart.
 * The details of HyStart are presented in
 *  Sangtae Ha and Injong Rhee,
 *  "Taming the Elephants: New TCP Slow Start", NCSU TechReport 2008.
 * Available from:
 *  http://netsrv.csc.ncsu.edu/export/hystart_techreport_2008.pdf
 *
 * All testing results are available from:
 * http://netsrv.csc.ncsu.edu/wiki/index.php/TCP_Testing
 *
 * Unless CUBIC is enabled and congestion window is large
 * this behaves the same as the original Reno.
 */

#include <linux/mm.h>
#include <linux/btf.h>
#include <linux/btf_ids.h>
#include <linux/module.h>
#include <linux/math64.h>
#include <net/tcp.h>

#define BICTCP_BETA_SCALE    1024	/* Scale factor beta calculation
					 * max_cwnd = snd_cwnd * beta
					 */
#define	BICTCP_HZ		10	/* BIC HZ 2^10 = 1024 */

/* Two methods of hybrid slow start */
#define HYSTART_ACK_TRAIN	0x1
#define HYSTART_DELAY		0x2

/* Number of delay samples for detecting the increase of delay */
#define HYSTART_MIN_SAMPLES	8
#define HYSTART_DELAY_MIN	(4000U)	/* 4 ms */
#define HYSTART_DELAY_MAX	(16000U)	/* 16 ms */
#define HYSTART_DELAY_THRESH(x)	clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX)

static int fast_convergence __read_mostly = 1;
static int beta __read_mostly = 717;	/* = 717/1024 (BICTCP_BETA_SCALE) */
static int initial_ssthresh __read_mostly;
static int bic_scale __read_mostly = 41;
static int tcp_friendliness __read_mostly = 1;

static int hystart __read_mostly = 1;
static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY;
static int hystart_low_window __read_mostly = 16;
static int hystart_ack_delta_us __read_mostly = 2000;

/* Scale factors precomputed once in cubictcp_register(). */
static u32 cube_rtt_scale __read_mostly;
static u32 beta_scale __read_mostly;
static u64 cube_factor __read_mostly;

/* Note parameters that are used for precomputing scale factors are read-only */
module_param(fast_convergence, int, 0644);
MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence");
module_param(beta, int, 0644);
MODULE_PARM_DESC(beta, "beta for multiplicative increase");
module_param(initial_ssthresh, int, 0644);
MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
module_param(bic_scale, int, 0444);
MODULE_PARM_DESC(bic_scale, "scale (scaled by 1024) value for bic function (bic_scale/1024)");
module_param(tcp_friendliness, int, 0644);
MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness");
module_param(hystart, int, 0644);
MODULE_PARM_DESC(hystart, "turn on/off hybrid slow start algorithm");
module_param(hystart_detect, int, 0644);
MODULE_PARM_DESC(hystart_detect, "hybrid slow start detection mechanisms"
		 " 1: packet-train 2: delay 3: both packet-train and delay");
module_param(hystart_low_window, int, 0644);
MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start");
module_param(hystart_ack_delta_us, int, 0644);
MODULE_PARM_DESC(hystart_ack_delta_us, "spacing between ack's indicating train (usecs)");

/* BIC TCP Parameters */
struct bictcp {
	u32	cnt;		/* increase cwnd by 1 after ACKs */
	u32	last_max_cwnd;	/* last maximum snd_cwnd */
	u32	last_cwnd;	/* the last snd_cwnd */
	u32	last_time;	/* time when updated last_cwnd */
	u32	bic_origin_point;/* origin point of bic function */
	u32	bic_K;		/* time to origin point
				   from the beginning of the current epoch */
	u32	delay_min;	/* min delay (usec) */
	u32	epoch_start;	/* beginning of an epoch */
	u32	ack_cnt;	/* number of acks */
	u32	tcp_cwnd;	/* estimated tcp cwnd */
	u16	unused;		/* bictcp_reset() zeroes everything before this */
	u8	sample_cnt;	/* number of samples to decide curr_rtt */
	u8	found;		/* the exit point is found? */
	u32	round_start;	/* beginning of each round */
	u32	end_seq;	/* end_seq of the round */
	u32	last_ack;	/* last time when the ACK spacing is close */
	u32	curr_rtt;	/* the minimum rtt of current round */
};

/* Zero every field that precedes @unused and clear the HyStart exit flag. */
static inline void bictcp_reset(struct bictcp *ca)
{
	memset(ca, 0, offsetof(struct bictcp, unused));
	ca->found = 0;
}

/* Return the socket's tcp_mstamp, converted to u32 by the return type. */
static inline u32 bictcp_clock_us(const struct sock *sk)
{
	return tcp_sk(sk)->tcp_mstamp;
}

/* Start a new HyStart round at the current clock value and snd_nxt. */
static inline void bictcp_hystart_reset(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);

	ca->round_start = ca->last_ack = bictcp_clock_us(sk);
	ca->end_seq = tp->snd_nxt;
	ca->curr_rtt = ~0U;	/* any real sample will be smaller */
	ca->sample_cnt = 0;
}

/*
 * .init callback: reset the private state, start HyStart if it is enabled,
 * otherwise apply initial_ssthresh when that module parameter is non-zero.
 */
__bpf_kfunc static void cubictcp_init(struct sock *sk)
{
	struct bictcp *ca = inet_csk_ca(sk);

	bictcp_reset(ca);

	if (hystart)
		bictcp_hystart_reset(sk);

	if (!hystart && initial_ssthresh)
		tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
}

/* .cwnd_event callback: only CA_EVENT_TX_START is acted upon. */
__bpf_kfunc static void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event)
{
	if (event == CA_EVENT_TX_START) {
		struct bictcp *ca = inet_csk_ca(sk);
		u32 now = tcp_jiffies32;
		s32 delta;

		delta = now - tcp_sk(sk)->lsndtime;

		/* We were application limited (idle) for a while.
		 * Shift epoch_start to keep cwnd growth to cubic curve.
		 */
		if (ca->epoch_start && delta > 0) {
			ca->epoch_start += delta;
			/* never let the epoch start lie in the future */
			if (after(ca->epoch_start, now))
				ca->epoch_start = now;
		}
		return;
	}
}

/* calculate the cubic root of x using a table lookup followed by one
 * Newton-Raphson iteration.
 * Avg err ~= 0.195%
 */
static u32 cubic_root(u64 a)
{
	u32 x, b, shift;
	/*
	 * cbrt(x) MSB values for x MSB values in [0..63].
	 * Precomputed then refined by hand - Willy Tarreau
	 *
	 * For x in [0..63],
	 *   v = cbrt(x << 18) - 1
	 *   cbrt(x) = (v[x] + 10) >> 6
	 */
	static const u8 v[] = {
		/* 0x00 */    0,   54,   54,   54,  118,  118,  118,  118,
		/* 0x08 */  123,  129,  134,  138,  143,  147,  151,  156,
		/* 0x10 */  157,  161,  164,  168,  170,  173,  176,  179,
		/* 0x18 */  181,  185,  187,  190,  192,  194,  197,  199,
		/* 0x20 */  200,  202,  204,  206,  209,  211,  213,  215,
		/* 0x28 */  217,  219,  221,  222,  224,  225,  227,  229,
		/* 0x30 */  231,  232,  234,  236,  237,  239,  240,  242,
		/* 0x38 */  244,  245,  246,  248,  250,  251,  252,  254,
	};

	b = fls64(a);
	if (b < 7) {
		/* a in [0..63] */
		return ((u32)v[(u32)a] + 35) >> 6;
	}

	/* Pick a shift so that (a >> (b * 3)) indexes the 64-entry table. */
	b = ((b * 84) >> 8) - 1;
	shift = (a >> (b * 3));

	x = ((u32)(((u32)v[shift] + 10) << b)) >> 6;

	/*
	 * Newton-Raphson iteration
	 *                         2
	 * x    = ( 2 * x  +  a / x  ) / 3
	 *  k+1          k         k
	 */
	x = (2 * x + (u32)div64_u64(a, (u64)x * (u64)(x - 1)));
	x = ((x * 341) >> 10);	/* 341/1024 approximates the division by 3 */
	return x;
}

/*
 * Compute congestion window to use.
 *
 * Updates ca->cnt for the current @cwnd after @acked newly acknowledged
 * packets.  The result is never below 2.
 */
static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked)
{
	u32 delta, bic_target, max_cnt;
	u64 offs, t;

	ca->ack_cnt += acked;	/* count the number of ACKed packets */

	/* cwnd unchanged and last update recent enough: keep ca->cnt as is */
	if (ca->last_cwnd == cwnd &&
	    (s32)(tcp_jiffies32 - ca->last_time) <= HZ / 32)
		return;

	/* The CUBIC function can update ca->cnt at most once per jiffy.
	 * On all cwnd reduction events, ca->epoch_start is set to 0,
	 * which will force a recalculation of ca->cnt.
	 */
	if (ca->epoch_start && tcp_jiffies32 == ca->last_time)
		goto tcp_friendliness;

	ca->last_cwnd = cwnd;
	ca->last_time = tcp_jiffies32;

	if (ca->epoch_start == 0) {
		ca->epoch_start = tcp_jiffies32;	/* record beginning */
		ca->ack_cnt = acked;			/* start counting */
		ca->tcp_cwnd = cwnd;			/* sync with cubic */

		if (ca->last_max_cwnd <= cwnd) {
			ca->bic_K = 0;
			ca->bic_origin_point = cwnd;
		} else {
			/* Compute new K based on
			 * (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ)
			 */
			ca->bic_K = cubic_root(cube_factor
					       * (ca->last_max_cwnd - cwnd));
			ca->bic_origin_point = ca->last_max_cwnd;
		}
	}

	/* cubic function - calc */
	/* calculate c * time^3 / rtt,
	 *  while considering overflow in calculation of time^3
	 * (so time^3 is done by using 64 bit)
	 * and without the support of division of 64bit numbers
	 * (so all divisions are done by using 32 bit)
	 *  also NOTE the unit of those variables
	 *	  time  = (t - K) / 2^bictcp_HZ
	 *	  c = bic_scale >> 10
	 * rtt  = (srtt >> 3) / HZ
	 * !!! The following code does not have overflow problems,
	 * if the cwnd < 1 million packets !!!
	 */

	t = (s32)(tcp_jiffies32 - ca->epoch_start);
	t += usecs_to_jiffies(ca->delay_min);
	/* change the unit from HZ to bictcp_HZ */
	t <<= BICTCP_HZ;
	do_div(t, HZ);

	if (t < ca->bic_K)		/* t - K */
		offs = ca->bic_K - t;
	else
		offs = t - ca->bic_K;

	/* c/rtt * (t-K)^3 */
	delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ);
	if (t < ca->bic_K)                            /* below origin */
		bic_target = ca->bic_origin_point - delta;
	else                                          /* above origin */
		bic_target = ca->bic_origin_point + delta;

	/* cubic function - calc bictcp_cnt */
	if (bic_target > cwnd) {
		ca->cnt = cwnd / (bic_target - cwnd);
	} else {
		ca->cnt = 100 * cwnd;              /* very small increment */
	}

	/*
	 * The initial growth of cubic function may be too conservative
	 * when the available bandwidth is still unknown.
	 */
	if (ca->last_max_cwnd == 0 && ca->cnt > 20)
		ca->cnt = 20;	/* increase cwnd 5% per RTT */

tcp_friendliness:
	/* TCP Friendly */
	if (tcp_friendliness) {
		u32 scale = beta_scale;

		delta = (cwnd * scale) >> 3;
		while (ca->ack_cnt > delta) {		/* update tcp cwnd */
			ca->ack_cnt -= delta;
			ca->tcp_cwnd++;
		}

		if (ca->tcp_cwnd > cwnd) {	/* if bic is slower than tcp */
			delta = ca->tcp_cwnd - cwnd;
			max_cnt = cwnd / delta;
			if (ca->cnt > max_cnt)
				ca->cnt = max_cnt;
		}
	}

	/* The maximum rate of cwnd increase CUBIC allows is 1 packet per
	 * 2 packets ACKed, meaning cwnd grows at 1.5x per RTT.
	 */
	ca->cnt = max(ca->cnt, 2U);
}

/*
 * .cong_avoid callback.  Does nothing unless the flow is cwnd-limited.
 * While in slow start, tcp_slow_start() is applied first and only the
 * ACKs it returns are passed on to the CUBIC update.
 */
__bpf_kfunc static void cubictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);

	if (!tcp_is_cwnd_limited(sk))
		return;

	if (tcp_in_slow_start(tp)) {
		acked = tcp_slow_start(tp, acked);
		if (!acked)
			return;
	}
	bictcp_update(ca, tcp_snd_cwnd(tp), acked);
	tcp_cong_avoid_ai(tp, ca->cnt, acked);
}

/*
 * .ssthresh callback: end the current epoch, record last_max_cwnd and
 * return cwnd * beta / BICTCP_BETA_SCALE, but at least 2.
 */
__bpf_kfunc static u32 cubictcp_recalc_ssthresh(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);

	ca->epoch_start = 0;	/* end of epoch */

	/* Wmax and fast convergence */
	if (tcp_snd_cwnd(tp) < ca->last_max_cwnd && fast_convergence)
		ca->last_max_cwnd = (tcp_snd_cwnd(tp) * (BICTCP_BETA_SCALE + beta))
			/ (2 * BICTCP_BETA_SCALE);
	else
		ca->last_max_cwnd = tcp_snd_cwnd(tp);

	return max((tcp_snd_cwnd(tp) * beta) / BICTCP_BETA_SCALE, 2U);
}

/* .set_state callback: on entering TCP_CA_Loss, reset CUBIC and HyStart. */
__bpf_kfunc static void cubictcp_state(struct sock *sk, u8 new_state)
{
	if (new_state == TCP_CA_Loss) {
		bictcp_reset(inet_csk_ca(sk));
		bictcp_hystart_reset(sk);
	}
}

/* Account for TSO/GRO delays.
 * Otherwise short RTT flows could get too small ssthresh, since during
 * slow start we begin with small TSO packets and ca->delay_min would
 * not account for long aggregation delay when TSO packets get bigger.
 * Ideally even with a very small RTT we would like to have at least one
 * TSO packet being sent and received by GRO, and another one in qdisc layer.
 * We apply another 100% factor because @rate is doubled at this point.
 * We cap the cushion to 1ms.
 *
 * Returns 0 when the socket has no pacing rate.
 */
static u32 hystart_ack_delay(const struct sock *sk)
{
	unsigned long rate;

	rate = READ_ONCE(sk->sk_pacing_rate);
	if (!rate)
		return 0;
	return min_t(u64, USEC_PER_MSEC,
		     div64_ul((u64)sk->sk_gso_max_size * 4 * USEC_PER_SEC, rate));
}

/*
 * Run the enabled HyStart detectors for one delay sample @delay.
 * When a detector fires, ca->found is set and snd_ssthresh is set to the
 * current cwnd.
 */
static void hystart_update(struct sock *sk, u32 delay)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);
	u32 threshold;

	/* the previous round has been fully acked: begin a new one */
	if (after(tp->snd_una, ca->end_seq))
		bictcp_hystart_reset(sk);

	if (hystart_detect & HYSTART_ACK_TRAIN) {
		u32 now = bictcp_clock_us(sk);

		/* first detection parameter - ack-train detection */
		if ((s32)(now - ca->last_ack) <= hystart_ack_delta_us) {
			ca->last_ack = now;

			threshold = ca->delay_min + hystart_ack_delay(sk);

			/* Hystart ack train triggers if we get ack past
			 * ca->delay_min/2.
			 * Pacing might have delayed packets up to RTT/2
			 * during slow start.
			 */
			if (sk->sk_pacing_status == SK_PACING_NONE)
				threshold >>= 1;

			if ((s32)(now - ca->round_start) > threshold) {
				ca->found = 1;
				pr_debug("hystart_ack_train (%u > %u) delay_min %u (+ ack_delay %u) cwnd %u\n",
					 now - ca->round_start, threshold,
					 ca->delay_min, hystart_ack_delay(sk), tcp_snd_cwnd(tp));
				NET_INC_STATS(sock_net(sk),
					      LINUX_MIB_TCPHYSTARTTRAINDETECT);
				NET_ADD_STATS(sock_net(sk),
					      LINUX_MIB_TCPHYSTARTTRAINCWND,
					      tcp_snd_cwnd(tp));
				tp->snd_ssthresh = tcp_snd_cwnd(tp);
			}
		}
	}

	if (hystart_detect & HYSTART_DELAY) {
		/* obtain the minimum delay of more than sampling packets */
		if (ca->curr_rtt > delay)
			ca->curr_rtt = delay;
		if (ca->sample_cnt < HYSTART_MIN_SAMPLES) {
			ca->sample_cnt++;
		} else {
			/* enough samples: compare the round minimum to delay_min */
			if (ca->curr_rtt > ca->delay_min +
			    HYSTART_DELAY_THRESH(ca->delay_min >> 3)) {
				ca->found = 1;
				NET_INC_STATS(sock_net(sk),
					      LINUX_MIB_TCPHYSTARTDELAYDETECT);
				NET_ADD_STATS(sock_net(sk),
					      LINUX_MIB_TCPHYSTARTDELAYCWND,
					      tcp_snd_cwnd(tp));
				tp->snd_ssthresh = tcp_snd_cwnd(tp);
			}
		}
	}
}

/*
 * .pkts_acked callback: track the minimum delay seen and, while in slow
 * start with HyStart enabled and no exit point found yet, feed the sample
 * to hystart_update().
 */
__bpf_kfunc static void cubictcp_acked(struct sock *sk, const struct ack_sample *sample)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);
	u32 delay;

	/* Some calls are for duplicates without timestamps */
	if (sample->rtt_us < 0)
		return;

	/* Discard delay samples right after fast recovery */
	if (ca->epoch_start && (s32)(tcp_jiffies32 - ca->epoch_start) < HZ)
		return;

	delay = sample->rtt_us;
	if (delay == 0)
		delay = 1;	/* 0 is reserved for "no delay_min recorded yet" */

	/* first time call or link delay decreases */
	if (ca->delay_min == 0 || ca->delay_min > delay)
		ca->delay_min = delay;

	/* hystart triggers when cwnd is larger than some threshold */
	if (!ca->found && tcp_in_slow_start(tp) && hystart &&
	    tcp_snd_cwnd(tp) >= hystart_low_window)
		hystart_update(sk, delay);
}

static struct tcp_congestion_ops cubictcp __read_mostly = {
	.init		= cubictcp_init,
	.ssthresh	= cubictcp_recalc_ssthresh,
	.cong_avoid	= cubictcp_cong_avoid,
	.set_state	= cubictcp_state,
	.undo_cwnd	= tcp_reno_undo_cwnd,
	.cwnd_event	= cubictcp_cwnd_event,
	.pkts_acked     = cubictcp_acked,
	.owner		= THIS_MODULE,
	.name		= "cubic",
};

/* The kfunc IDs are only listed when both config options below are set. */
BTF_KFUNCS_START(tcp_cubic_check_kfunc_ids)
#ifdef CONFIG_X86
#ifdef CONFIG_DYNAMIC_FTRACE
BTF_ID_FLAGS(func, cubictcp_init)
BTF_ID_FLAGS(func, cubictcp_recalc_ssthresh)
BTF_ID_FLAGS(func, cubictcp_cong_avoid)
BTF_ID_FLAGS(func, cubictcp_state)
BTF_ID_FLAGS(func, cubictcp_cwnd_event)
BTF_ID_FLAGS(func, cubictcp_acked)
#endif
#endif
BTF_KFUNCS_END(tcp_cubic_check_kfunc_ids)

static const struct btf_kfunc_id_set tcp_cubic_kfunc_set = {
	.owner = THIS_MODULE,
	.set   = &tcp_cubic_check_kfunc_ids,
};

/*
 * Module init: precompute the scale factors, register the kfunc ID set and
 * then the congestion control itself.  Returns a negative value if the
 * kfunc registration fails, otherwise the result of
 * tcp_register_congestion_control().
 */
static int __init cubictcp_register(void)
{
	int ret;

	BUILD_BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE);

	/* Precompute a bunch of the scaling factors that are used per-packet
	 * based on SRTT of 100ms
	 */

	beta_scale = 8*(BICTCP_BETA_SCALE+beta) / 3
		/ (BICTCP_BETA_SCALE - beta);

	cube_rtt_scale = (bic_scale * 10);	/* 1024*c/rtt */

	/* calculate the "K" for (wmax-cwnd) = c/rtt * K^3
	 *  so K = cubic_root( (wmax-cwnd)*rtt/c )
	 * the unit of K is bictcp_HZ=2^10, not HZ
	 *
	 *  c = bic_scale >> 10
	 *  rtt = 100ms
	 *
	 * the following code has been designed and tested for
	 * cwnd < 1 million packets
	 * RTT < 100 seconds
	 * HZ < 1,000,00  (corresponding to 10 nano-second)
	 */

	/* 1/c * 2^2*bictcp_HZ * srtt */
	cube_factor = 1ull << (10+3*BICTCP_HZ); /* 2^40 */

	/* divide by bic_scale and by constant Srtt (100ms) */
	do_div(cube_factor, bic_scale * 10);

	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &tcp_cubic_kfunc_set);
	if (ret < 0)
		return ret;
	return tcp_register_congestion_control(&cubictcp);
}

static void __exit cubictcp_unregister(void)
{
	tcp_unregister_congestion_control(&cubictcp);
}

module_init(cubictcp_register);
module_exit(cubictcp_unregister);

MODULE_AUTHOR("Sangtae Ha, Stephen Hemminger");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("CUBIC TCP");
MODULE_VERSION("2.3");
11 11 11 3 4 9 4 2 9 12 14 11 8 11 3 9 6 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 // SPDX-License-Identifier: GPL-2.0-or-later /* * x64 SIMD accelerated ChaCha and XChaCha stream ciphers, * including ChaCha20 (RFC7539) * * Copyright (C) 2015 Martin Willi */ #include <crypto/algapi.h> #include <crypto/internal/chacha.h> #include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/sizes.h> #include <asm/simd.h> asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); asmlinkage void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds); asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src, unsigned int len, 
int nrounds); asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); asmlinkage void chacha_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); asmlinkage void chacha_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd); static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2); static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl); static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks) { len = min(len, maxblocks * CHACHA_BLOCK_SIZE); return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE; } static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src, unsigned int bytes, int nrounds) { if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&chacha_use_avx512vl)) { while (bytes >= CHACHA_BLOCK_SIZE * 8) { chacha_8block_xor_avx512vl(state, dst, src, bytes, nrounds); bytes -= CHACHA_BLOCK_SIZE * 8; src += CHACHA_BLOCK_SIZE * 8; dst += CHACHA_BLOCK_SIZE * 8; state[12] += 8; } if (bytes > CHACHA_BLOCK_SIZE * 4) { chacha_8block_xor_avx512vl(state, dst, src, bytes, nrounds); state[12] += chacha_advance(bytes, 8); return; } if (bytes > CHACHA_BLOCK_SIZE * 2) { chacha_4block_xor_avx512vl(state, dst, src, bytes, nrounds); state[12] += chacha_advance(bytes, 4); return; } if (bytes) { chacha_2block_xor_avx512vl(state, dst, src, bytes, nrounds); state[12] += chacha_advance(bytes, 2); return; } } if (static_branch_likely(&chacha_use_avx2)) { while (bytes >= CHACHA_BLOCK_SIZE * 8) { chacha_8block_xor_avx2(state, dst, src, bytes, nrounds); bytes -= CHACHA_BLOCK_SIZE * 8; src += CHACHA_BLOCK_SIZE * 8; dst += CHACHA_BLOCK_SIZE * 8; 
state[12] += 8; } if (bytes > CHACHA_BLOCK_SIZE * 4) { chacha_8block_xor_avx2(state, dst, src, bytes, nrounds); state[12] += chacha_advance(bytes, 8); return; } if (bytes > CHACHA_BLOCK_SIZE * 2) { chacha_4block_xor_avx2(state, dst, src, bytes, nrounds); state[12] += chacha_advance(bytes, 4); return; } if (bytes > CHACHA_BLOCK_SIZE) { chacha_2block_xor_avx2(state, dst, src, bytes, nrounds); state[12] += chacha_advance(bytes, 2); return; } } while (bytes >= CHACHA_BLOCK_SIZE * 4) { chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds); bytes -= CHACHA_BLOCK_SIZE * 4; src += CHACHA_BLOCK_SIZE * 4; dst += CHACHA_BLOCK_SIZE * 4; state[12] += 4; } if (bytes > CHACHA_BLOCK_SIZE) { chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds); state[12] += chacha_advance(bytes, 4); return; } if (bytes) { chacha_block_xor_ssse3(state, dst, src, bytes, nrounds); state[12]++; } } void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds) { if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable()) { hchacha_block_generic(state, stream, nrounds); } else { kernel_fpu_begin(); hchacha_block_ssse3(state, stream, nrounds); kernel_fpu_end(); } } EXPORT_SYMBOL(hchacha_block_arch); void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv) { chacha_init_generic(state, key, iv); } EXPORT_SYMBOL(chacha_init_arch); void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes, int nrounds) { if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable() || bytes <= CHACHA_BLOCK_SIZE) return chacha_crypt_generic(state, dst, src, bytes, nrounds); do { unsigned int todo = min_t(unsigned int, bytes, SZ_4K); kernel_fpu_begin(); chacha_dosimd(state, dst, src, todo, nrounds); kernel_fpu_end(); bytes -= todo; src += todo; dst += todo; } while (bytes); } EXPORT_SYMBOL(chacha_crypt_arch); static int chacha_simd_stream_xor(struct skcipher_request *req, const struct chacha_ctx *ctx, const u8 *iv) { u32 state[CHACHA_STATE_WORDS] __aligned(8); struct 
skcipher_walk walk; int err; err = skcipher_walk_virt(&walk, req, false); chacha_init_generic(state, ctx->key, iv); while (walk.nbytes > 0) { unsigned int nbytes = walk.nbytes; if (nbytes < walk.total) nbytes = round_down(nbytes, walk.stride); if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable()) { chacha_crypt_generic(state, walk.dst.virt.addr, walk.src.virt.addr, nbytes, ctx->nrounds); } else { kernel_fpu_begin(); chacha_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, nbytes, ctx->nrounds); kernel_fpu_end(); } err = skcipher_walk_done(&walk, walk.nbytes - nbytes); } return err; } static int chacha_simd(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); return chacha_simd_stream_xor(req, ctx, req->iv); } static int xchacha_simd(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); u32 state[CHACHA_STATE_WORDS] __aligned(8); struct chacha_ctx subctx; u8 real_iv[16]; chacha_init_generic(state, ctx->key, req->iv); if (req->cryptlen > CHACHA_BLOCK_SIZE && crypto_simd_usable()) { kernel_fpu_begin(); hchacha_block_ssse3(state, subctx.key, ctx->nrounds); kernel_fpu_end(); } else { hchacha_block_generic(state, subctx.key, ctx->nrounds); } subctx.nrounds = ctx->nrounds; memcpy(&real_iv[0], req->iv + 24, 8); memcpy(&real_iv[8], req->iv + 16, 8); return chacha_simd_stream_xor(req, &subctx, real_iv); } static struct skcipher_alg algs[] = { { .base.cra_name = "chacha20", .base.cra_driver_name = "chacha20-simd", .base.cra_priority = 300, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct chacha_ctx), .base.cra_module = THIS_MODULE, .min_keysize = CHACHA_KEY_SIZE, .max_keysize = CHACHA_KEY_SIZE, .ivsize = CHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, .setkey = chacha20_setkey, .encrypt = chacha_simd, .decrypt = chacha_simd, }, { .base.cra_name = "xchacha20", 
.base.cra_driver_name = "xchacha20-simd", .base.cra_priority = 300, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct chacha_ctx), .base.cra_module = THIS_MODULE, .min_keysize = CHACHA_KEY_SIZE, .max_keysize = CHACHA_KEY_SIZE, .ivsize = XCHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, .setkey = chacha20_setkey, .encrypt = xchacha_simd, .decrypt = xchacha_simd, }, { .base.cra_name = "xchacha12", .base.cra_driver_name = "xchacha12-simd", .base.cra_priority = 300, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct chacha_ctx), .base.cra_module = THIS_MODULE, .min_keysize = CHACHA_KEY_SIZE, .max_keysize = CHACHA_KEY_SIZE, .ivsize = XCHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, .setkey = chacha12_setkey, .encrypt = xchacha_simd, .decrypt = xchacha_simd, }, }; static int __init chacha_simd_mod_init(void) { if (!boot_cpu_has(X86_FEATURE_SSSE3)) return 0; static_branch_enable(&chacha_use_simd); if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { static_branch_enable(&chacha_use_avx2); if (IS_ENABLED(CONFIG_AS_AVX512) && boot_cpu_has(X86_FEATURE_AVX512VL) && boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */ static_branch_enable(&chacha_use_avx512vl); } return IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) ? crypto_register_skciphers(algs, ARRAY_SIZE(algs)) : 0; } static void __exit chacha_simd_mod_fini(void) { if (IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) && boot_cpu_has(X86_FEATURE_SSSE3)) crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); } module_init(chacha_simd_mod_init); module_exit(chacha_simd_mod_fini); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Martin Willi <martin@strongswan.org>"); MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (x64 SIMD accelerated)"); MODULE_ALIAS_CRYPTO("chacha20"); MODULE_ALIAS_CRYPTO("chacha20-simd"); MODULE_ALIAS_CRYPTO("xchacha20"); MODULE_ALIAS_CRYPTO("xchacha20-simd"); MODULE_ALIAS_CRYPTO("xchacha12"); MODULE_ALIAS_CRYPTO("xchacha12-simd");
2 2 2 4 6 4 6 6 5 4 4 6 6 5 6 6 5 6 5 5 6 6 3 4 5 6 6 5 5 2 5 11 3 8 8 8 6 5 1 2 1 1 1 2 1 1 2 2 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 
502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 
1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 // SPDX-License-Identifier: GPL-2.0-only #include "cgroup-internal.h" #include <linux/ctype.h> #include <linux/kmod.h> #include <linux/sort.h> #include <linux/delay.h> #include <linux/mm.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/magic.h> #include 
<linux/slab.h> #include <linux/vmalloc.h> #include <linux/delayacct.h> #include <linux/pid_namespace.h> #include <linux/cgroupstats.h> #include <linux/fs_parser.h> #include <trace/events/cgroup.h> /* * pidlists linger the following amount before being destroyed. The goal * is avoiding frequent destruction in the middle of consecutive read calls * Expiring in the middle is a performance problem not a correctness one. * 1 sec should be enough. */ #define CGROUP_PIDLIST_DESTROY_DELAY HZ /* Controllers blocked by the commandline in v1 */ static u16 cgroup_no_v1_mask; /* disable named v1 mounts */ static bool cgroup_no_v1_named; /* * pidlist destructions need to be flushed on cgroup destruction. Use a * separate workqueue as flush domain. */ static struct workqueue_struct *cgroup_pidlist_destroy_wq; /* protects cgroup_subsys->release_agent_path */ static DEFINE_SPINLOCK(release_agent_path_lock); bool cgroup1_ssid_disabled(int ssid) { return cgroup_no_v1_mask & (1 << ssid); } /** * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' * @from: attach to all cgroups of a given task * @tsk: the task to be attached * * Return: %0 on success or a negative errno code on failure */ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) { struct cgroup_root *root; int retval = 0; cgroup_lock(); cgroup_attach_lock(true); for_each_root(root) { struct cgroup *from_cgrp; spin_lock_irq(&css_set_lock); from_cgrp = task_cgroup_from_root(from, root); spin_unlock_irq(&css_set_lock); retval = cgroup_attach_task(from_cgrp, tsk, false); if (retval) break; } cgroup_attach_unlock(true); cgroup_unlock(); return retval; } EXPORT_SYMBOL_GPL(cgroup_attach_task_all); /** * cgroup_transfer_tasks - move tasks from one cgroup to another * @to: cgroup to which the tasks will be moved * @from: cgroup in which the tasks currently reside * * Locking rules between cgroup_post_fork() and the migration path * guarantee that, if a task is forking while being 
migrated, the new child * is guaranteed to be either visible in the source cgroup after the * parent's migration is complete or put into the target cgroup. No task * can slip out of migration through forking. * * Return: %0 on success or a negative errno code on failure */ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) { DEFINE_CGROUP_MGCTX(mgctx); struct cgrp_cset_link *link; struct css_task_iter it; struct task_struct *task; int ret; if (cgroup_on_dfl(to)) return -EINVAL; ret = cgroup_migrate_vet_dst(to); if (ret) return ret; cgroup_lock(); cgroup_attach_lock(true); /* all tasks in @from are being moved, all csets are source */ spin_lock_irq(&css_set_lock); list_for_each_entry(link, &from->cset_links, cset_link) cgroup_migrate_add_src(link->cset, to, &mgctx); spin_unlock_irq(&css_set_lock); ret = cgroup_migrate_prepare_dst(&mgctx); if (ret) goto out_err; /* * Migrate tasks one-by-one until @from is empty. This fails iff * ->can_attach() fails. */ do { css_task_iter_start(&from->self, 0, &it); do { task = css_task_iter_next(&it); } while (task && (task->flags & PF_EXITING)); if (task) get_task_struct(task); css_task_iter_end(&it); if (task) { ret = cgroup_migrate(task, false, &mgctx); if (!ret) TRACE_CGROUP_PATH(transfer_tasks, to, task, false); put_task_struct(task); } } while (task && !ret); out_err: cgroup_migrate_finish(&mgctx); cgroup_attach_unlock(true); cgroup_unlock(); return ret; } /* * Stuff for reading the 'tasks'/'procs' files. * * Reading this file can return large amounts of data if a cgroup has * *lots* of attached tasks. So it may need several calls to read(), * but we cannot guarantee that the information we produce is correct * unless we produce it entirely atomically. * */ /* which pidlist file are we talking about? */ enum cgroup_filetype { CGROUP_FILE_PROCS, CGROUP_FILE_TASKS, }; /* * A pidlist is a list of pids that virtually represents the contents of one * of the cgroup files ("procs" or "tasks"). 
We keep a list of such pidlists, * a pair (one each for procs, tasks) for each pid namespace that's relevant * to the cgroup. */ struct cgroup_pidlist { /* * used to find which pidlist is wanted. doesn't change as long as * this particular list stays in the list. */ struct { enum cgroup_filetype type; struct pid_namespace *ns; } key; /* array of xids */ pid_t *list; /* how many elements the above list has */ int length; /* each of these stored in a list by its cgroup */ struct list_head links; /* pointer to the cgroup we belong to, for list removal purposes */ struct cgroup *owner; /* for delayed destruction */ struct delayed_work destroy_dwork; }; /* * Used to destroy all pidlists lingering waiting for destroy timer. None * should be left afterwards. */ void cgroup1_pidlist_destroy_all(struct cgroup *cgrp) { struct cgroup_pidlist *l, *tmp_l; mutex_lock(&cgrp->pidlist_mutex); list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links) mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0); mutex_unlock(&cgrp->pidlist_mutex); flush_workqueue(cgroup_pidlist_destroy_wq); BUG_ON(!list_empty(&cgrp->pidlists)); } static void cgroup_pidlist_destroy_work_fn(struct work_struct *work) { struct delayed_work *dwork = to_delayed_work(work); struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist, destroy_dwork); struct cgroup_pidlist *tofree = NULL; mutex_lock(&l->owner->pidlist_mutex); /* * Destroy iff we didn't get queued again. The state won't change * as destroy_dwork can only be queued while locked. */ if (!delayed_work_pending(dwork)) { list_del(&l->links); kvfree(l->list); put_pid_ns(l->key.ns); tofree = l; } mutex_unlock(&l->owner->pidlist_mutex); kfree(tofree); } /* * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries * Returns the number of unique elements. */ static int pidlist_uniq(pid_t *list, int length) { int src, dest = 1; /* * we presume the 0th element is unique, so i starts at 1. 
trivial * edge cases first; no work needs to be done for either */ if (length == 0 || length == 1) return length; /* src and dest walk down the list; dest counts unique elements */ for (src = 1; src < length; src++) { /* find next unique element */ while (list[src] == list[src-1]) { src++; if (src == length) goto after; } /* dest always points to where the next unique element goes */ list[dest] = list[src]; dest++; } after: return dest; } /* * The two pid files - task and cgroup.procs - guaranteed that the result * is sorted, which forced this whole pidlist fiasco. As pid order is * different per namespace, each namespace needs differently sorted list, * making it impossible to use, for example, single rbtree of member tasks * sorted by task pointer. As pidlists can be fairly large, allocating one * per open file is dangerous, so cgroup had to implement shared pool of * pidlists keyed by cgroup and namespace. */ static int cmppid(const void *a, const void *b) { return *(pid_t *)a - *(pid_t *)b; } static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, enum cgroup_filetype type) { struct cgroup_pidlist *l; /* don't need task_nsproxy() if we're looking at ourself */ struct pid_namespace *ns = task_active_pid_ns(current); lockdep_assert_held(&cgrp->pidlist_mutex); list_for_each_entry(l, &cgrp->pidlists, links) if (l->key.type == type && l->key.ns == ns) return l; return NULL; } /* * find the appropriate pidlist for our purpose (given procs vs tasks) * returns with the lock on that pidlist already held, and takes care * of the use count, or returns NULL with no locks held if we're out of * memory. 
*/ static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, enum cgroup_filetype type) { struct cgroup_pidlist *l; lockdep_assert_held(&cgrp->pidlist_mutex); l = cgroup_pidlist_find(cgrp, type); if (l) return l; /* entry not found; create a new one */ l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); if (!l) return l; INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn); l->key.type = type; /* don't need task_nsproxy() if we're looking at ourself */ l->key.ns = get_pid_ns(task_active_pid_ns(current)); l->owner = cgrp; list_add(&l->links, &cgrp->pidlists); return l; } /* * Load a cgroup's pidarray with either procs' tgids or tasks' pids */ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, struct cgroup_pidlist **lp) { pid_t *array; int length; int pid, n = 0; /* used for populating the array */ struct css_task_iter it; struct task_struct *tsk; struct cgroup_pidlist *l; lockdep_assert_held(&cgrp->pidlist_mutex); /* * If cgroup gets more users after we read count, we won't have * enough space - tough. This race is indistinguishable to the * caller from the case that the additional cgroup users didn't * show up until sometime later on. 
*/ length = cgroup_task_count(cgrp); array = kvmalloc_array(length, sizeof(pid_t), GFP_KERNEL); if (!array) return -ENOMEM; /* now, populate the array */ css_task_iter_start(&cgrp->self, 0, &it); while ((tsk = css_task_iter_next(&it))) { if (unlikely(n == length)) break; /* get tgid or pid for procs or tasks file respectively */ if (type == CGROUP_FILE_PROCS) pid = task_tgid_vnr(tsk); else pid = task_pid_vnr(tsk); if (pid > 0) /* make sure to only use valid results */ array[n++] = pid; } css_task_iter_end(&it); length = n; /* now sort & strip out duplicates (tgids or recycled thread PIDs) */ sort(array, length, sizeof(pid_t), cmppid, NULL); length = pidlist_uniq(array, length); l = cgroup_pidlist_find_create(cgrp, type); if (!l) { kvfree(array); return -ENOMEM; } /* store array, freeing old if necessary */ kvfree(l->list); l->list = array; l->length = length; *lp = l; return 0; } /* * seq_file methods for the tasks/procs files. The seq_file position is the * next pid to display; the seq_file iterator is a pointer to the pid * in the cgroup->l->list array. */ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) { /* * Initially we receive a position value that corresponds to * one more than the last pid shown (or 0 on the first call or * after a seek to the start). Use a binary-search to find the * next pid to display, if any */ struct kernfs_open_file *of = s->private; struct cgroup_file_ctx *ctx = of->priv; struct cgroup *cgrp = seq_css(s)->cgroup; struct cgroup_pidlist *l; enum cgroup_filetype type = seq_cft(s)->private; int index = 0, pid = *pos; int *iter, ret; mutex_lock(&cgrp->pidlist_mutex); /* * !NULL @ctx->procs1.pidlist indicates that this isn't the first * start() after open. If the matching pidlist is around, we can use * that. Look for it. Note that @ctx->procs1.pidlist can't be used * directly. It could already have been destroyed. 
*/ if (ctx->procs1.pidlist) ctx->procs1.pidlist = cgroup_pidlist_find(cgrp, type); /* * Either this is the first start() after open or the matching * pidlist has been destroyed inbetween. Create a new one. */ if (!ctx->procs1.pidlist) { ret = pidlist_array_load(cgrp, type, &ctx->procs1.pidlist); if (ret) return ERR_PTR(ret); } l = ctx->procs1.pidlist; if (pid) { int end = l->length; while (index < end) { int mid = (index + end) / 2; if (l->list[mid] == pid) { index = mid; break; } else if (l->list[mid] < pid) index = mid + 1; else end = mid; } } /* If we're off the end of the array, we're done */ if (index >= l->length) return NULL; /* Update the abstract position to be the actual pid that we found */ iter = l->list + index; *pos = *iter; return iter; } static void cgroup_pidlist_stop(struct seq_file *s, void *v) { struct kernfs_open_file *of = s->private; struct cgroup_file_ctx *ctx = of->priv; struct cgroup_pidlist *l = ctx->procs1.pidlist; if (l) mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, CGROUP_PIDLIST_DESTROY_DELAY); mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex); } static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) { struct kernfs_open_file *of = s->private; struct cgroup_file_ctx *ctx = of->priv; struct cgroup_pidlist *l = ctx->procs1.pidlist; pid_t *p = v; pid_t *end = l->list + l->length; /* * Advance to the next pid in the array. 
If this goes off the * end, we're done */ p++; if (p >= end) { (*pos)++; return NULL; } else { *pos = *p; return p; } } static int cgroup_pidlist_show(struct seq_file *s, void *v) { seq_printf(s, "%d\n", *(int *)v); return 0; } static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, bool threadgroup) { struct cgroup *cgrp; struct task_struct *task; const struct cred *cred, *tcred; ssize_t ret; bool locked; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; task = cgroup_procs_write_start(buf, threadgroup, &locked); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; /* * Even if we're attaching all tasks in the thread group, we only need * to check permissions on one of them. Check permissions using the * credentials from file open to protect against inherited fd attacks. */ cred = of->file->f_cred; tcred = get_task_cred(task); if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && !uid_eq(cred->euid, tcred->uid) && !uid_eq(cred->euid, tcred->suid)) ret = -EACCES; put_cred(tcred); if (ret) goto out_finish; ret = cgroup_attach_task(cgrp, task, threadgroup); out_finish: cgroup_procs_write_finish(task, locked); out_unlock: cgroup_kn_unlock(of->kn); return ret ?: nbytes; } static ssize_t cgroup1_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return __cgroup1_procs_write(of, buf, nbytes, off, true); } static ssize_t cgroup1_tasks_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return __cgroup1_procs_write(of, buf, nbytes, off, false); } static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup *cgrp; struct cgroup_file_ctx *ctx; BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); /* * Release agent gets called with all capabilities, * require capabilities to set release agent. 
*/ ctx = of->priv; if ((ctx->ns->user_ns != &init_user_ns) || !file_ns_capable(of->file, &init_user_ns, CAP_SYS_ADMIN)) return -EPERM; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; spin_lock(&release_agent_path_lock); strscpy(cgrp->root->release_agent_path, strstrip(buf), sizeof(cgrp->root->release_agent_path)); spin_unlock(&release_agent_path_lock); cgroup_kn_unlock(of->kn); return nbytes; } static int cgroup_release_agent_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; spin_lock(&release_agent_path_lock); seq_puts(seq, cgrp->root->release_agent_path); spin_unlock(&release_agent_path_lock); seq_putc(seq, '\n'); return 0; } static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) { seq_puts(seq, "0\n"); return 0; } static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, struct cftype *cft) { return notify_on_release(css->cgroup); } static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { if (val) set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); else clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); return 0; } static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, struct cftype *cft) { return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); } static int cgroup_clone_children_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { if (val) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); else clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); return 0; } /* cgroup core interface files for the legacy hierarchies */ struct cftype cgroup1_base_files[] = { { .name = "cgroup.procs", .seq_start = cgroup_pidlist_start, .seq_next = cgroup_pidlist_next, .seq_stop = cgroup_pidlist_stop, .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_PROCS, .write = cgroup1_procs_write, }, { .name = "cgroup.clone_children", .read_u64 = cgroup_clone_children_read, .write_u64 = 
cgroup_clone_children_write, }, { .name = "cgroup.sane_behavior", .flags = CFTYPE_ONLY_ON_ROOT, .seq_show = cgroup_sane_behavior_show, }, { .name = "tasks", .seq_start = cgroup_pidlist_start, .seq_next = cgroup_pidlist_next, .seq_stop = cgroup_pidlist_stop, .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_TASKS, .write = cgroup1_tasks_write, }, { .name = "notify_on_release", .read_u64 = cgroup_read_notify_on_release, .write_u64 = cgroup_write_notify_on_release, }, { .name = "release_agent", .flags = CFTYPE_ONLY_ON_ROOT, .seq_show = cgroup_release_agent_show, .write = cgroup_release_agent_write, .max_write_len = PATH_MAX - 1, }, { } /* terminate */ }; /* Display information about each subsystem and each hierarchy */ int proc_cgroupstats_show(struct seq_file *m, void *v) { struct cgroup_subsys *ss; int i; seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); /* * Grab the subsystems state racily. No need to add avenue to * cgroup_mutex contention. */ for_each_subsys(ss, i) seq_printf(m, "%s\t%d\t%d\t%d\n", ss->legacy_name, ss->root->hierarchy_id, atomic_read(&ss->root->nr_cgrps), cgroup_ssid_enabled(i)); return 0; } /** * cgroupstats_build - build and fill cgroupstats * @stats: cgroupstats to fill information into * @dentry: A dentry entry belonging to the cgroup for which stats have * been requested. * * Build and fill cgroupstats so that taskstats can export it to user * space. * * Return: %0 on success or a negative errno code on failure */ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { struct kernfs_node *kn = kernfs_node_from_dentry(dentry); struct cgroup *cgrp; struct css_task_iter it; struct task_struct *tsk; /* it should be kernfs_node belonging to cgroupfs and is a directory */ if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || kernfs_type(kn) != KERNFS_DIR) return -EINVAL; /* * We aren't being called from kernfs and there's no guarantee on * @kn->priv's validity. 
For this and css_tryget_online_from_dir(), * @kn->priv is RCU safe. Let's do the RCU dancing. */ rcu_read_lock(); cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); if (!cgrp || !cgroup_tryget(cgrp)) { rcu_read_unlock(); return -ENOENT; } rcu_read_unlock(); css_task_iter_start(&cgrp->self, 0, &it); while ((tsk = css_task_iter_next(&it))) { switch (READ_ONCE(tsk->__state)) { case TASK_RUNNING: stats->nr_running++; break; case TASK_INTERRUPTIBLE: stats->nr_sleeping++; break; case TASK_UNINTERRUPTIBLE: stats->nr_uninterruptible++; break; case TASK_STOPPED: stats->nr_stopped++; break; default: if (tsk->in_iowait) stats->nr_io_wait++; break; } } css_task_iter_end(&it); cgroup_put(cgrp); return 0; } void cgroup1_check_for_release(struct cgroup *cgrp) { if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) && !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) schedule_work(&cgrp->release_agent_work); } /* * Notify userspace when a cgroup is released, by running the * configured release agent with the name of the cgroup (path * relative to the root of cgroup file system) as the argument. * * Most likely, this user command will try to rmdir this cgroup. * * This races with the possibility that some other task will be * attached to this cgroup before it is removed, or that some other * user task will 'mkdir' a child cgroup of this cgroup. That's ok. * The presumed 'rmdir' will fail quietly if this cgroup is no longer * unused, and this cgroup will be reprieved from its death sentence, * to continue to serve a useful existence. Next time it's released, * we will get notified again, if it still has 'notify_on_release' set. * * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which * means only wait until the task is successfully execve()'d. The * separate release agent task is forked by call_usermodehelper(), * then control in this thread returns here, without waiting for the * release agent task. 
We don't bother to wait because the caller of * this routine has no use for the exit status of the release agent * task, so no sense holding our caller up for that. */ void cgroup1_release_agent(struct work_struct *work) { struct cgroup *cgrp = container_of(work, struct cgroup, release_agent_work); char *pathbuf, *agentbuf; char *argv[3], *envp[3]; int ret; /* snoop agent path and exit early if empty */ if (!cgrp->root->release_agent_path[0]) return; /* prepare argument buffers */ pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); agentbuf = kmalloc(PATH_MAX, GFP_KERNEL); if (!pathbuf || !agentbuf) goto out_free; spin_lock(&release_agent_path_lock); strscpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX); spin_unlock(&release_agent_path_lock); if (!agentbuf[0]) goto out_free; ret = cgroup_path_ns(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); if (ret < 0) goto out_free; argv[0] = agentbuf; argv[1] = pathbuf; argv[2] = NULL; /* minimal command environment */ envp[0] = "HOME=/"; envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; envp[2] = NULL; call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); out_free: kfree(agentbuf); kfree(pathbuf); } /* * cgroup_rename - Only allow simple rename of directories in place. */ static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name_str) { struct cgroup *cgrp = kn->priv; int ret; /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */ if (strchr(new_name_str, '\n')) return -EINVAL; if (kernfs_type(kn) != KERNFS_DIR) return -ENOTDIR; if (kn->parent != new_parent) return -EIO; /* * We're gonna grab cgroup_mutex which nests outside kernfs * active_ref. kernfs_rename() doesn't require active_ref * protection. Break them before grabbing cgroup_mutex. 
*/ kernfs_break_active_protection(new_parent); kernfs_break_active_protection(kn); cgroup_lock(); ret = kernfs_rename(kn, new_parent, new_name_str); if (!ret) TRACE_CGROUP_PATH(rename, cgrp); cgroup_unlock(); kernfs_unbreak_active_protection(kn); kernfs_unbreak_active_protection(new_parent); return ret; } static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root) { struct cgroup_root *root = cgroup_root_from_kf(kf_root); struct cgroup_subsys *ss; int ssid; for_each_subsys(ss, ssid) if (root->subsys_mask & (1 << ssid)) seq_show_option(seq, ss->legacy_name, NULL); if (root->flags & CGRP_ROOT_NOPREFIX) seq_puts(seq, ",noprefix"); if (root->flags & CGRP_ROOT_XATTR) seq_puts(seq, ",xattr"); if (root->flags & CGRP_ROOT_CPUSET_V2_MODE) seq_puts(seq, ",cpuset_v2_mode"); if (root->flags & CGRP_ROOT_FAVOR_DYNMODS) seq_puts(seq, ",favordynmods"); spin_lock(&release_agent_path_lock); if (strlen(root->release_agent_path)) seq_show_option(seq, "release_agent", root->release_agent_path); spin_unlock(&release_agent_path_lock); if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) seq_puts(seq, ",clone_children"); if (strlen(root->name)) seq_show_option(seq, "name", root->name); return 0; } enum cgroup1_param { Opt_all, Opt_clone_children, Opt_cpuset_v2_mode, Opt_name, Opt_none, Opt_noprefix, Opt_release_agent, Opt_xattr, Opt_favordynmods, Opt_nofavordynmods, }; const struct fs_parameter_spec cgroup1_fs_parameters[] = { fsparam_flag ("all", Opt_all), fsparam_flag ("clone_children", Opt_clone_children), fsparam_flag ("cpuset_v2_mode", Opt_cpuset_v2_mode), fsparam_string("name", Opt_name), fsparam_flag ("none", Opt_none), fsparam_flag ("noprefix", Opt_noprefix), fsparam_string("release_agent", Opt_release_agent), fsparam_flag ("xattr", Opt_xattr), fsparam_flag ("favordynmods", Opt_favordynmods), fsparam_flag ("nofavordynmods", Opt_nofavordynmods), {} }; int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct cgroup_fs_context 
*ctx = cgroup_fc2context(fc); struct cgroup_subsys *ss; struct fs_parse_result result; int opt, i; opt = fs_parse(fc, cgroup1_fs_parameters, param, &result); if (opt == -ENOPARAM) { int ret; ret = vfs_parse_fs_param_source(fc, param); if (ret != -ENOPARAM) return ret; for_each_subsys(ss, i) { if (strcmp(param->key, ss->legacy_name)) continue; if (!cgroup_ssid_enabled(i) || cgroup1_ssid_disabled(i)) return invalfc(fc, "Disabled controller '%s'", param->key); ctx->subsys_mask |= (1 << i); return 0; } return invalfc(fc, "Unknown subsys name '%s'", param->key); } if (opt < 0) return opt; switch (opt) { case Opt_none: /* Explicitly have no subsystems */ ctx->none = true; break; case Opt_all: ctx->all_ss = true; break; case Opt_noprefix: ctx->flags |= CGRP_ROOT_NOPREFIX; break; case Opt_clone_children: ctx->cpuset_clone_children = true; break; case Opt_cpuset_v2_mode: ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE; break; case Opt_xattr: ctx->flags |= CGRP_ROOT_XATTR; break; case Opt_favordynmods: ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS; break; case Opt_nofavordynmods: ctx->flags &= ~CGRP_ROOT_FAVOR_DYNMODS; break; case Opt_release_agent: /* Specifying two release agents is forbidden */ if (ctx->release_agent) return invalfc(fc, "release_agent respecified"); /* * Release agent gets called with all capabilities, * require capabilities to set release agent. */ if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) return invalfc(fc, "Setting release_agent not allowed"); ctx->release_agent = param->string; param->string = NULL; break; case Opt_name: /* blocked by boot param? 
*/ if (cgroup_no_v1_named) return -ENOENT; /* Can't specify an empty name */ if (!param->size) return invalfc(fc, "Empty name"); if (param->size > MAX_CGROUP_ROOT_NAMELEN - 1) return invalfc(fc, "Name too long"); /* Must match [\w.-]+ */ for (i = 0; i < param->size; i++) { char c = param->string[i]; if (isalnum(c)) continue; if ((c == '.') || (c == '-') || (c == '_')) continue; return invalfc(fc, "Invalid name"); } /* Specifying two names is forbidden */ if (ctx->name) return invalfc(fc, "name respecified"); ctx->name = param->string; param->string = NULL; break; } return 0; } static int check_cgroupfs_options(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); u16 mask = U16_MAX; u16 enabled = 0; struct cgroup_subsys *ss; int i; #ifdef CONFIG_CPUSETS mask = ~((u16)1 << cpuset_cgrp_id); #endif for_each_subsys(ss, i) if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i)) enabled |= 1 << i; ctx->subsys_mask &= enabled; /* * In absence of 'none', 'name=' and subsystem name options, * let's default to 'all'. */ if (!ctx->subsys_mask && !ctx->none && !ctx->name) ctx->all_ss = true; if (ctx->all_ss) { /* Mutually exclusive option 'all' + subsystem name */ if (ctx->subsys_mask) return invalfc(fc, "subsys name conflicts with all"); /* 'all' => select all the subsystems */ ctx->subsys_mask = enabled; } /* * We either have to specify by name or by subsystems. (So all * empty hierarchies must have a name). */ if (!ctx->subsys_mask && !ctx->name) return invalfc(fc, "Need name or subsystem set"); /* * Option noprefix was introduced just for backward compatibility * with the old cpuset, so we allow noprefix only if mounting just * the cpuset subsystem. 
*/ if ((ctx->flags & CGRP_ROOT_NOPREFIX) && (ctx->subsys_mask & mask)) return invalfc(fc, "noprefix used incorrectly"); /* Can't specify "none" and some subsystems */ if (ctx->subsys_mask && ctx->none) return invalfc(fc, "none used incorrectly"); return 0; } int cgroup1_reconfigure(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); struct kernfs_root *kf_root = kernfs_root_from_sb(fc->root->d_sb); struct cgroup_root *root = cgroup_root_from_kf(kf_root); int ret = 0; u16 added_mask, removed_mask; cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); /* See what subsystems are wanted */ ret = check_cgroupfs_options(fc); if (ret) goto out_unlock; if (ctx->subsys_mask != root->subsys_mask || ctx->release_agent) pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n", task_tgid_nr(current), current->comm); added_mask = ctx->subsys_mask & ~root->subsys_mask; removed_mask = root->subsys_mask & ~ctx->subsys_mask; /* Don't allow flags or name to change at remount */ if ((ctx->flags ^ root->flags) || (ctx->name && strcmp(ctx->name, root->name))) { errorfc(fc, "option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"", ctx->flags, ctx->name ?: "", root->flags, root->name); ret = -EINVAL; goto out_unlock; } /* remounting is not allowed for populated hierarchies */ if (!list_empty(&root->cgrp.self.children)) { ret = -EBUSY; goto out_unlock; } ret = rebind_subsystems(root, added_mask); if (ret) goto out_unlock; WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask)); if (ctx->release_agent) { spin_lock(&release_agent_path_lock); strcpy(root->release_agent_path, ctx->release_agent); spin_unlock(&release_agent_path_lock); } trace_cgroup_remount(root); out_unlock: cgroup_unlock(); return ret; } struct kernfs_syscall_ops cgroup1_kf_syscall_ops = { .rename = cgroup1_rename, .show_options = cgroup1_show_options, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, .show_path = cgroup_show_path, }; /* * The guts of cgroup1 mount - find or create 
cgroup_root to use.
 * Called with cgroup_mutex held; returns 0 on success, -E... on
 * error and positive - in case when the candidate is busy dying.
 * On success it stashes a reference to cgroup_root into given
 * cgroup_fs_context; that reference is *NOT* counting towards the
 * cgroup_root refcount.
 */
static int cgroup1_root_to_use(struct fs_context *fc)
{
	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
	struct cgroup_root *root;
	struct cgroup_subsys *ss;
	int i, ret;

	/* First find the desired set of subsystems */
	ret = check_cgroupfs_options(fc);
	if (ret)
		return ret;

	/*
	 * Destruction of cgroup root is asynchronous, so subsystems may
	 * still be dying after the previous unmount.  Let's drain the
	 * dying subsystems.  We just need to ensure that the ones
	 * unmounted previously finish dying and don't care about new ones
	 * starting.  Testing ref liveliness is good enough.
	 */
	for_each_subsys(ss, i) {
		if (!(ctx->subsys_mask & (1 << i)) ||
		    ss->root == &cgrp_dfl_root)
			continue;

		if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt))
			return 1;	/* restart */
		/* Only liveness was being probed; drop the reference again. */
		cgroup_put(&ss->root->cgrp);
	}

	for_each_root(root) {
		bool name_match = false;

		if (root == &cgrp_dfl_root)
			continue;

		/*
		 * If we asked for a name then it must match.  Also, if
		 * name matches but subsys_mask doesn't, we should fail.
		 * Remember whether name matched.
		 */
		if (ctx->name) {
			if (strcmp(ctx->name, root->name))
				continue;
			name_match = true;
		}

		/*
		 * If we asked for subsystems (or explicitly for no
		 * subsystems) then they must match.
		 */
		if ((ctx->subsys_mask || ctx->none) &&
		    (ctx->subsys_mask != root->subsys_mask)) {
			if (!name_match)
				continue;
			return -EBUSY;
		}

		if (root->flags ^ ctx->flags)
			pr_warn("new mount options do not match the existing superblock, will be ignored\n");

		ctx->root = root;
		return 0;
	}

	/*
	 * No such thing, create a new one.  name= matching without subsys
	 * specification is allowed for already existing hierarchies but we
	 * can't create new one without subsys specification.
	 */
	if (!ctx->subsys_mask && !ctx->none)
		return invalfc(fc, "No subsys list or none specified");

	/* Hierarchies may only be created in the initial cgroup namespace. */
	if (ctx->ns != &init_cgroup_ns)
		return -EPERM;

	root = kzalloc(sizeof(*root), GFP_KERNEL);
	if (!root)
		return -ENOMEM;

	ctx->root = root;
	init_cgroup_root(ctx);

	/*
	 * NOTE(review): on failure ctx->root still points at @root after
	 * cgroup_free_root() - confirm no caller uses ctx->root on error.
	 */
	ret = cgroup_setup_root(root, ctx->subsys_mask);
	if (!ret)
		cgroup_favor_dynmods(root, ctx->flags & CGRP_ROOT_FAVOR_DYNMODS);
	else
		cgroup_free_root(root);

	return ret;
}

/*
 * Mount entry point for cgroup1: pick or create the cgroup_root under
 * cgroup_mutex, then build the tree with cgroup_do_get_tree().
 *
 * Returns -EPERM without CAP_SYS_ADMIN in the namespace's user_ns.  A
 * positive internal result means the chosen root was dying: the function
 * sleeps 10ms and returns restart_syscall().  Otherwise returns 0 or a
 * negative errno.
 */
int cgroup1_get_tree(struct fs_context *fc)
{
	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
	int ret;

	/* Check if the caller has permission to mount. */
	if (!ns_capable(ctx->ns->user_ns, CAP_SYS_ADMIN))
		return -EPERM;

	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);

	/* Pin the root before dropping the mutex; failure means it is dying. */
	ret = cgroup1_root_to_use(fc);
	if (!ret && !percpu_ref_tryget_live(&ctx->root->cgrp.self.refcnt))
		ret = 1;	/* restart */

	cgroup_unlock();

	if (!ret)
		ret = cgroup_do_get_tree(fc);

	/* The root started dying while the tree was being set up: undo and retry. */
	if (!ret && percpu_ref_is_dying(&ctx->root->cgrp.self.refcnt)) {
		fc_drop_locked(fc);
		ret = 1;
	}

	if (unlikely(ret > 0)) {
		msleep(10);
		return restart_syscall();
	}
	return ret;
}

/**
 * task_get_cgroup1 - Acquires the associated cgroup of a task within a
 * specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its
 * hierarchy ID.
 * @tsk: The target task
 * @hierarchy_id: The ID of a cgroup1 hierarchy
 *
 * On success, the cgroup is returned. On failure, ERR_PTR is returned.
 * We limit it to cgroup1 only.
*/ struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id) { struct cgroup *cgrp = ERR_PTR(-ENOENT); struct cgroup_root *root; unsigned long flags; rcu_read_lock(); for_each_root(root) { /* cgroup1 only*/ if (root == &cgrp_dfl_root) continue; if (root->hierarchy_id != hierarchy_id) continue; spin_lock_irqsave(&css_set_lock, flags); cgrp = task_cgroup_from_root(tsk, root); if (!cgrp || !cgroup_tryget(cgrp)) cgrp = ERR_PTR(-ENOENT); spin_unlock_irqrestore(&css_set_lock, flags); break; } rcu_read_unlock(); return cgrp; } static int __init cgroup1_wq_init(void) { /* * Used to destroy pidlists and separate to serve as flush domain. * Cap @max_active to 1 too. */ cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy", 0, 1); BUG_ON(!cgroup_pidlist_destroy_wq); return 0; } core_initcall(cgroup1_wq_init); static int __init cgroup_no_v1(char *str) { struct cgroup_subsys *ss; char *token; int i; while ((token = strsep(&str, ",")) != NULL) { if (!*token) continue; if (!strcmp(token, "all")) { cgroup_no_v1_mask = U16_MAX; continue; } if (!strcmp(token, "named")) { cgroup_no_v1_named = true; continue; } for_each_subsys(ss, i) { if (strcmp(token, ss->name) && strcmp(token, ss->legacy_name)) continue; cgroup_no_v1_mask |= 1 << i; } } return 1; } __setup("cgroup_no_v1=", cgroup_no_v1);
19 2 19 2 25 25 25 25 4 1 2 2 2 18 19 7 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 /* * Non-physical true random number generator based on timing jitter -- * Linux Kernel Crypto API specific code * * Copyright Stephan Mueller <smueller@chronox.de>, 2015 - 2023 * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, and the entire permission notice in its entirety, * including the disclaimer of warranties. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior * written permission. * * ALTERNATIVELY, this product may be distributed under the terms of * the GNU General Public License, in which case the provisions of the GPL2 are * required INSTEAD OF the above restrictions. (This clause is * necessary due to a potential bad interaction between the GPL and * the restrictions contained in a BSD-style copyright.) * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. 
*/ #include <crypto/hash.h> #include <crypto/sha3.h> #include <linux/fips.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/time.h> #include <crypto/internal/rng.h> #include "jitterentropy.h" #define JENT_CONDITIONING_HASH "sha3-256-generic" /*************************************************************************** * Helper function ***************************************************************************/ void *jent_kvzalloc(unsigned int len) { return kvzalloc(len, GFP_KERNEL); } void jent_kvzfree(void *ptr, unsigned int len) { memzero_explicit(ptr, len); kvfree(ptr); } void *jent_zalloc(unsigned int len) { return kzalloc(len, GFP_KERNEL); } void jent_zfree(void *ptr) { kfree_sensitive(ptr); } /* * Obtain a high-resolution time stamp value. The time stamp is used to measure * the execution time of a given code path and its variations. Hence, the time * stamp must have a sufficiently high resolution. * * Note, if the function returns zero because a given architecture does not * implement a high-resolution time stamp, the RNG code's runtime test * will detect it and will not produce output. */ void jent_get_nstime(__u64 *out) { __u64 tmp = 0; tmp = random_get_entropy(); /* * If random_get_entropy does not return a value, i.e. it is not * implemented for a given architecture, use a clock source. * hoping that there are timers we can work with. 
*/ if (tmp == 0) tmp = ktime_get_ns(); *out = tmp; jent_raw_hires_entropy_store(tmp); } int jent_hash_time(void *hash_state, __u64 time, u8 *addtl, unsigned int addtl_len, __u64 hash_loop_cnt, unsigned int stuck) { struct shash_desc *hash_state_desc = (struct shash_desc *)hash_state; SHASH_DESC_ON_STACK(desc, hash_state_desc->tfm); u8 intermediary[SHA3_256_DIGEST_SIZE]; __u64 j = 0; int ret; desc->tfm = hash_state_desc->tfm; if (sizeof(intermediary) != crypto_shash_digestsize(desc->tfm)) { pr_warn_ratelimited("Unexpected digest size\n"); return -EINVAL; } /* * This loop fills a buffer which is injected into the entropy pool. * The main reason for this loop is to execute something over which we * can perform a timing measurement. The injection of the resulting * data into the pool is performed to ensure the result is used and * the compiler cannot optimize the loop away in case the result is not * used at all. Yet that data is considered "additional information" * considering the terminology from SP800-90A without any entropy. * * Note, it does not matter which or how much data you inject, we are * interested in one Keccack1600 compression operation performed with * the crypto_shash_final. */ for (j = 0; j < hash_loop_cnt; j++) { ret = crypto_shash_init(desc) ?: crypto_shash_update(desc, intermediary, sizeof(intermediary)) ?: crypto_shash_finup(desc, addtl, addtl_len, intermediary); if (ret) goto err; } /* * Inject the data from the previous loop into the pool. This data is * not considered to contain any entropy, but it stirs the pool a bit. */ ret = crypto_shash_update(desc, intermediary, sizeof(intermediary)); if (ret) goto err; /* * Insert the time stamp into the hash context representing the pool. * * If the time stamp is stuck, do not finally insert the value into the * entropy pool. 
Although this operation should not do any harm even * when the time stamp has no entropy, SP800-90B requires that any * conditioning operation to have an identical amount of input data * according to section 3.1.5. */ if (!stuck) { ret = crypto_shash_update(hash_state_desc, (u8 *)&time, sizeof(__u64)); } err: shash_desc_zero(desc); memzero_explicit(intermediary, sizeof(intermediary)); return ret; } int jent_read_random_block(void *hash_state, char *dst, unsigned int dst_len) { struct shash_desc *hash_state_desc = (struct shash_desc *)hash_state; u8 jent_block[SHA3_256_DIGEST_SIZE]; /* Obtain data from entropy pool and re-initialize it */ int ret = crypto_shash_final(hash_state_desc, jent_block) ?: crypto_shash_init(hash_state_desc) ?: crypto_shash_update(hash_state_desc, jent_block, sizeof(jent_block)); if (!ret && dst_len) memcpy(dst, jent_block, dst_len); memzero_explicit(jent_block, sizeof(jent_block)); return ret; } /*************************************************************************** * Kernel crypto API interface ***************************************************************************/ struct jitterentropy { spinlock_t jent_lock; struct rand_data *entropy_collector; struct crypto_shash *tfm; struct shash_desc *sdesc; }; static void jent_kcapi_cleanup(struct crypto_tfm *tfm) { struct jitterentropy *rng = crypto_tfm_ctx(tfm); spin_lock(&rng->jent_lock); if (rng->sdesc) { shash_desc_zero(rng->sdesc); kfree(rng->sdesc); } rng->sdesc = NULL; if (rng->tfm) crypto_free_shash(rng->tfm); rng->tfm = NULL; if (rng->entropy_collector) jent_entropy_collector_free(rng->entropy_collector); rng->entropy_collector = NULL; spin_unlock(&rng->jent_lock); } static int jent_kcapi_init(struct crypto_tfm *tfm) { struct jitterentropy *rng = crypto_tfm_ctx(tfm); struct crypto_shash *hash; struct shash_desc *sdesc; int size, ret = 0; spin_lock_init(&rng->jent_lock); /* * Use SHA3-256 as conditioner. 
We allocate only the generic * implementation as we are not interested in high-performance. The * execution time of the SHA3 operation is measured and adds to the * Jitter RNG's unpredictable behavior. If we have a slower hash * implementation, the execution timing variations are larger. When * using a fast implementation, we would need to call it more often * as its variations are lower. */ hash = crypto_alloc_shash(JENT_CONDITIONING_HASH, 0, 0); if (IS_ERR(hash)) { pr_err("Cannot allocate conditioning digest\n"); return PTR_ERR(hash); } rng->tfm = hash; size = sizeof(struct shash_desc) + crypto_shash_descsize(hash); sdesc = kmalloc(size, GFP_KERNEL); if (!sdesc) { ret = -ENOMEM; goto err; } sdesc->tfm = hash; crypto_shash_init(sdesc); rng->sdesc = sdesc; rng->entropy_collector = jent_entropy_collector_alloc(CONFIG_CRYPTO_JITTERENTROPY_OSR, 0, sdesc); if (!rng->entropy_collector) { ret = -ENOMEM; goto err; } spin_lock_init(&rng->jent_lock); return 0; err: jent_kcapi_cleanup(tfm); return ret; } static int jent_kcapi_random(struct crypto_rng *tfm, const u8 *src, unsigned int slen, u8 *rdata, unsigned int dlen) { struct jitterentropy *rng = crypto_rng_ctx(tfm); int ret = 0; spin_lock(&rng->jent_lock); ret = jent_read_entropy(rng->entropy_collector, rdata, dlen); if (ret == -3) { /* Handle permanent health test error */ /* * If the kernel was booted with fips=1, it implies that * the entire kernel acts as a FIPS 140 module. In this case * an SP800-90B permanent health test error is treated as * a FIPS module error. 
*/ if (fips_enabled) panic("Jitter RNG permanent health test failure\n"); pr_err("Jitter RNG permanent health test failure\n"); ret = -EFAULT; } else if (ret == -2) { /* Handle intermittent health test error */ pr_warn_ratelimited("Reset Jitter RNG due to intermittent health test failure\n"); ret = -EAGAIN; } else if (ret == -1) { /* Handle other errors */ ret = -EINVAL; } spin_unlock(&rng->jent_lock); return ret; } static int jent_kcapi_reset(struct crypto_rng *tfm, const u8 *seed, unsigned int slen) { return 0; } static struct rng_alg jent_alg = { .generate = jent_kcapi_random, .seed = jent_kcapi_reset, .seedsize = 0, .base = { .cra_name = "jitterentropy_rng", .cra_driver_name = "jitterentropy_rng", .cra_priority = 100, .cra_ctxsize = sizeof(struct jitterentropy), .cra_module = THIS_MODULE, .cra_init = jent_kcapi_init, .cra_exit = jent_kcapi_cleanup, } }; static int __init jent_mod_init(void) { SHASH_DESC_ON_STACK(desc, tfm); struct crypto_shash *tfm; int ret = 0; jent_testing_init(); tfm = crypto_alloc_shash(JENT_CONDITIONING_HASH, 0, 0); if (IS_ERR(tfm)) { jent_testing_exit(); return PTR_ERR(tfm); } desc->tfm = tfm; crypto_shash_init(desc); ret = jent_entropy_init(CONFIG_CRYPTO_JITTERENTROPY_OSR, 0, desc, NULL); shash_desc_zero(desc); crypto_free_shash(tfm); if (ret) { /* Handle permanent health test error */ if (fips_enabled) panic("jitterentropy: Initialization failed with host not compliant with requirements: %d\n", ret); jent_testing_exit(); pr_info("jitterentropy: Initialization failed with host not compliant with requirements: %d\n", ret); return -EFAULT; } return crypto_register_rng(&jent_alg); } static void __exit jent_mod_exit(void) { jent_testing_exit(); crypto_unregister_rng(&jent_alg); } module_init(jent_mod_init); module_exit(jent_mod_exit); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Stephan Mueller <smueller@chronox.de>"); MODULE_DESCRIPTION("Non-physical True Random Number Generator based on CPU Jitter"); 
MODULE_ALIAS_CRYPTO("jitterentropy_rng");
4 4 4 4 4 4 4 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 
524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 
1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 
1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 
1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 
2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 
2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 
3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 
3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992 Linus Torvalds */ /* * 'tty_io.c' gives an orthogonal feeling to tty's, be they consoles * or rs-channels. It also implements echoing, cooked mode etc. * * Kill-line thanks to John T Kohl, who also corrected VMIN = VTIME = 0. * * Modified by Theodore Ts'o, 9/14/92, to dynamically allocate the * tty_struct and tty_queue structures. Previously there was an array * of 256 tty_struct's which was statically allocated, and the * tty_queue structures were allocated at boot time. Both are now * dynamically allocated only when the tty is open. 
* * Also restructured routines so that there is more of a separation * between the high-level tty routines (tty_io.c and tty_ioctl.c) and * the low-level tty routines (serial.c, pty.c, console.c). This * makes for cleaner and more compact code. -TYT, 9/17/92 * * Modified by Fred N. van Kempen, 01/29/93, to add line disciplines * which can be dynamically activated and de-activated by the line * discipline handling modules (like SLIP). * * NOTE: pay no attention to the line discipline code (yet); its * interface is still subject to change in this version... * -- TYT, 1/31/92 * * Added functionality to the OPOST tty handling. No delays, but all * other bits should be there. * -- Nick Holloway <alfie@dcs.warwick.ac.uk>, 27th May 1993. * * Rewrote canonical mode and added more termios flags. * -- julian@uhunix.uhcc.hawaii.edu (J. Cowley), 13Jan94 * * Reorganized FASYNC support so mouse code can share it. * -- ctm@ardi.com, 9Sep95 * * New TIOCLINUX variants added. * -- mj@k332.feld.cvut.cz, 19-Nov-95 * * Restrict vt switching via ioctl() * -- grif@cs.ucr.edu, 5-Dec-95 * * Move console and virtual terminal code to more appropriate files, * implement CONFIG_VT and generalize console device interface. * -- Marko Kohtala <Marko.Kohtala@hut.fi>, March 97 * * Rewrote tty_init_dev and tty_release_dev to eliminate races. * -- Bill Hawes <whawes@star.net>, June 97 * * Added devfs support. * -- C. Scott Ananian <cananian@alumni.princeton.edu>, 13-Jan-1998 * * Added support for a Unix98-style ptmx device. * -- C. Scott Ananian <cananian@alumni.princeton.edu>, 14-Jan-1998 * * Reduced memory usage for older ARM systems * -- Russell King <rmk@arm.linux.org.uk> * * Move do_SAK() into process context. Less stack use in devfs functions. 
* alloc_tty_struct() always uses kmalloc()
 *	-- Andrew Morton <andrewm@uow.edu.eu> 17Mar01
 */

#include <linux/types.h>
#include <linux/major.h>
#include <linux/errno.h>
#include <linux/signal.h>
#include <linux/fcntl.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/interrupt.h>
#include <linux/tty.h>
#include <linux/tty_driver.h>
#include <linux/tty_flip.h>
#include <linux/devpts_fs.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/console.h>
#include <linux/timer.h>
#include <linux/ctype.h>
#include <linux/kd.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/ppp-ioctl.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/device.h>
#include <linux/wait.h>
#include <linux/bitops.h>
#include <linux/delay.h>
#include <linux/seq_file.h>
#include <linux/serial.h>
#include <linux/ratelimit.h>
#include <linux/compat.h>
#include <linux/uaccess.h>
#include <linux/termios_internal.h>
#include <linux/fs.h>

#include <linux/kbd_kern.h>
#include <linux/vt_kern.h>
#include <linux/selection.h>

#include <linux/kmod.h>
#include <linux/nsproxy.h>
#include "tty.h"

/*
 * Hangup tracing switch. TTY_DEBUG_HANGUP is explicitly undefined here, so
 * the #ifdef below selects the empty tty_debug_hangup(); remove the #undef
 * (or define the symbol) to forward the messages to tty_debug() instead.
 */
#undef TTY_DEBUG_HANGUP
#ifdef TTY_DEBUG_HANGUP
# define tty_debug_hangup(tty, f, args...)	tty_debug(tty, f, ##args)
#else
# define tty_debug_hangup(tty, f, args...)	do { } while (0)
#endif

/* Compiles in the NULL-tty check performed by tty_paranoia_check(). */
#define TTY_PARANOIA_CHECK 1
/* Compiles in the body of check_tty_count(). */
#define CHECK_TTY_COUNT 1

/*
 * Default termios settings, exported for use by tty drivers. The input and
 * output speeds are both 38400, matching the B38400 rate in c_cflag. The
 * line discipline field is deliberately left out of the initializer (see
 * the commented-out .c_line entry).
 */
struct ktermios tty_std_termios = {	/* for the benefit of tty drivers  */
	.c_iflag = ICRNL | IXON,
	.c_oflag = OPOST | ONLCR,
	.c_cflag = B38400 | CS8 | CREAD | HUPCL,
	.c_lflag = ISIG | ICANON | ECHO | ECHOE | ECHOK |
		   ECHOCTL | ECHOKE | IEXTEN,
	.c_cc = INIT_C_CC,
	.c_ispeed = 38400,
	.c_ospeed = 38400,
	/* .c_line = N_TTY, */
};
EXPORT_SYMBOL(tty_std_termios);

/* This list gets poked at by procfs and various bits of boot up code. This
 * could do with some rationalisation such as pulling the tty proc function
 * into this file.
*/ LIST_HEAD(tty_drivers); /* linked list of tty drivers */ /* Mutex to protect creating and releasing a tty */ DEFINE_MUTEX(tty_mutex); static ssize_t tty_read(struct kiocb *, struct iov_iter *); static ssize_t tty_write(struct kiocb *, struct iov_iter *); static __poll_t tty_poll(struct file *, poll_table *); static int tty_open(struct inode *, struct file *); #ifdef CONFIG_COMPAT static long tty_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); #else #define tty_compat_ioctl NULL #endif static int __tty_fasync(int fd, struct file *filp, int on); static int tty_fasync(int fd, struct file *filp, int on); static void release_tty(struct tty_struct *tty, int idx); /** * free_tty_struct - free a disused tty * @tty: tty struct to free * * Free the write buffers, tty queue and tty memory itself. * * Locking: none. Must be called after tty is definitely unused */ static void free_tty_struct(struct tty_struct *tty) { tty_ldisc_deinit(tty); put_device(tty->dev); kvfree(tty->write_buf); kfree(tty); } static inline struct tty_struct *file_tty(struct file *file) { return ((struct tty_file_private *)file->private_data)->tty; } int tty_alloc_file(struct file *file) { struct tty_file_private *priv; priv = kmalloc(sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; file->private_data = priv; return 0; } /* Associate a new file with the tty structure */ void tty_add_file(struct tty_struct *tty, struct file *file) { struct tty_file_private *priv = file->private_data; priv->tty = tty; priv->file = file; spin_lock(&tty->files_lock); list_add(&priv->list, &tty->tty_files); spin_unlock(&tty->files_lock); } /** * tty_free_file - free file->private_data * @file: to free private_data of * * This shall be used only for fail path handling when tty_add_file was not * called yet. 
*/ void tty_free_file(struct file *file) { struct tty_file_private *priv = file->private_data; file->private_data = NULL; kfree(priv); } /* Delete file from its tty */ static void tty_del_file(struct file *file) { struct tty_file_private *priv = file->private_data; struct tty_struct *tty = priv->tty; spin_lock(&tty->files_lock); list_del(&priv->list); spin_unlock(&tty->files_lock); tty_free_file(file); } /** * tty_name - return tty naming * @tty: tty structure * * Convert a tty structure into a name. The name reflects the kernel naming * policy and if udev is in use may not reflect user space * * Locking: none */ const char *tty_name(const struct tty_struct *tty) { if (!tty) /* Hmm. NULL pointer. That's fun. */ return "NULL tty"; return tty->name; } EXPORT_SYMBOL(tty_name); const char *tty_driver_name(const struct tty_struct *tty) { if (!tty || !tty->driver) return ""; return tty->driver->name; } static int tty_paranoia_check(struct tty_struct *tty, struct inode *inode, const char *routine) { #ifdef TTY_PARANOIA_CHECK if (!tty) { pr_warn("(%d:%d): %s: NULL tty\n", imajor(inode), iminor(inode), routine); return 1; } #endif return 0; } /* Caller must hold tty_lock */ static void check_tty_count(struct tty_struct *tty, const char *routine) { #ifdef CHECK_TTY_COUNT struct list_head *p; int count = 0, kopen_count = 0; spin_lock(&tty->files_lock); list_for_each(p, &tty->tty_files) { count++; } spin_unlock(&tty->files_lock); if (tty->driver->type == TTY_DRIVER_TYPE_PTY && tty->driver->subtype == PTY_TYPE_SLAVE && tty->link && tty->link->count) count++; if (tty_port_kopened(tty->port)) kopen_count++; if (tty->count != (count + kopen_count)) { tty_warn(tty, "%s: tty->count(%d) != (#fd's(%d) + #kopen's(%d))\n", routine, tty->count, count, kopen_count); } #endif } /** * get_tty_driver - find device of a tty * @device: device identifier * @index: returns the index of the tty * * This routine returns a tty driver structure, given a device number and also * passes back the index 
number. * * Locking: caller must hold tty_mutex */ static struct tty_driver *get_tty_driver(dev_t device, int *index) { struct tty_driver *p; list_for_each_entry(p, &tty_drivers, tty_drivers) { dev_t base = MKDEV(p->major, p->minor_start); if (device < base || device >= base + p->num) continue; *index = device - base; return tty_driver_kref_get(p); } return NULL; } /** * tty_dev_name_to_number - return dev_t for device name * @name: user space name of device under /dev * @number: pointer to dev_t that this function will populate * * This function converts device names like ttyS0 or ttyUSB1 into dev_t like * (4, 64) or (188, 1). If no corresponding driver is registered then the * function returns -%ENODEV. * * Locking: this acquires tty_mutex to protect the tty_drivers list from * being modified while we are traversing it, and makes sure to * release it before exiting. */ int tty_dev_name_to_number(const char *name, dev_t *number) { struct tty_driver *p; int ret; int index, prefix_length = 0; const char *str; for (str = name; *str && !isdigit(*str); str++) ; if (!*str) return -EINVAL; ret = kstrtoint(str, 10, &index); if (ret) return ret; prefix_length = str - name; mutex_lock(&tty_mutex); list_for_each_entry(p, &tty_drivers, tty_drivers) if (prefix_length == strlen(p->name) && strncmp(name, p->name, prefix_length) == 0) { if (index < p->num) { *number = MKDEV(p->major, p->minor_start + index); goto out; } } /* if here then driver wasn't found */ ret = -ENODEV; out: mutex_unlock(&tty_mutex); return ret; } EXPORT_SYMBOL_GPL(tty_dev_name_to_number); #ifdef CONFIG_CONSOLE_POLL /** * tty_find_polling_driver - find device of a polled tty * @name: name string to match * @line: pointer to resulting tty line nr * * This routine returns a tty driver structure, given a name and the condition * that the tty driver is capable of polled operation. 
*/ struct tty_driver *tty_find_polling_driver(char *name, int *line) { struct tty_driver *p, *res = NULL; int tty_line = 0; int len; char *str, *stp; for (str = name; *str; str++) if ((*str >= '0' && *str <= '9') || *str == ',') break; if (!*str) return NULL; len = str - name; tty_line = simple_strtoul(str, &str, 10); mutex_lock(&tty_mutex); /* Search through the tty devices to look for a match */ list_for_each_entry(p, &tty_drivers, tty_drivers) { if (!len || strncmp(name, p->name, len) != 0) continue; stp = str; if (*stp == ',') stp++; if (*stp == '\0') stp = NULL; if (tty_line >= 0 && tty_line < p->num && p->ops && p->ops->poll_init && !p->ops->poll_init(p, tty_line, stp)) { res = tty_driver_kref_get(p); *line = tty_line; break; } } mutex_unlock(&tty_mutex); return res; } EXPORT_SYMBOL_GPL(tty_find_polling_driver); #endif static ssize_t hung_up_tty_read(struct kiocb *iocb, struct iov_iter *to) { return 0; } static ssize_t hung_up_tty_write(struct kiocb *iocb, struct iov_iter *from) { return -EIO; } /* No kernel lock held - none needed ;) */ static __poll_t hung_up_tty_poll(struct file *filp, poll_table *wait) { return EPOLLIN | EPOLLOUT | EPOLLERR | EPOLLHUP | EPOLLRDNORM | EPOLLWRNORM; } static long hung_up_tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return cmd == TIOCSPGRP ? -ENOTTY : -EIO; } static long hung_up_tty_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return cmd == TIOCSPGRP ? 
-ENOTTY : -EIO; } static int hung_up_tty_fasync(int fd, struct file *file, int on) { return -ENOTTY; } static void tty_show_fdinfo(struct seq_file *m, struct file *file) { struct tty_struct *tty = file_tty(file); if (tty && tty->ops && tty->ops->show_fdinfo) tty->ops->show_fdinfo(tty, m); } static const struct file_operations tty_fops = { .llseek = no_llseek, .read_iter = tty_read, .write_iter = tty_write, .splice_read = copy_splice_read, .splice_write = iter_file_splice_write, .poll = tty_poll, .unlocked_ioctl = tty_ioctl, .compat_ioctl = tty_compat_ioctl, .open = tty_open, .release = tty_release, .fasync = tty_fasync, .show_fdinfo = tty_show_fdinfo, }; static const struct file_operations console_fops = { .llseek = no_llseek, .read_iter = tty_read, .write_iter = redirected_tty_write, .splice_read = copy_splice_read, .splice_write = iter_file_splice_write, .poll = tty_poll, .unlocked_ioctl = tty_ioctl, .compat_ioctl = tty_compat_ioctl, .open = tty_open, .release = tty_release, .fasync = tty_fasync, }; static const struct file_operations hung_up_tty_fops = { .llseek = no_llseek, .read_iter = hung_up_tty_read, .write_iter = hung_up_tty_write, .poll = hung_up_tty_poll, .unlocked_ioctl = hung_up_tty_ioctl, .compat_ioctl = hung_up_tty_compat_ioctl, .release = tty_release, .fasync = hung_up_tty_fasync, }; static DEFINE_SPINLOCK(redirect_lock); static struct file *redirect; /** * tty_wakeup - request more data * @tty: terminal * * Internal and external helper for wakeups of tty. This function informs the * line discipline if present that the driver is ready to receive more output * data. 
*/ void tty_wakeup(struct tty_struct *tty) { struct tty_ldisc *ld; if (test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags)) { ld = tty_ldisc_ref(tty); if (ld) { if (ld->ops->write_wakeup) ld->ops->write_wakeup(tty); tty_ldisc_deref(ld); } } wake_up_interruptible_poll(&tty->write_wait, EPOLLOUT); } EXPORT_SYMBOL_GPL(tty_wakeup); /** * tty_release_redirect - Release a redirect on a pty if present * @tty: tty device * * This is available to the pty code so if the master closes, if the slave is a * redirect it can release the redirect. */ static struct file *tty_release_redirect(struct tty_struct *tty) { struct file *f = NULL; spin_lock(&redirect_lock); if (redirect && file_tty(redirect) == tty) { f = redirect; redirect = NULL; } spin_unlock(&redirect_lock); return f; } /** * __tty_hangup - actual handler for hangup events * @tty: tty device * @exit_session: if non-zero, signal all foreground group processes * * This can be called by a "kworker" kernel thread. That is process synchronous * but doesn't hold any locks, so we need to make sure we have the appropriate * locks for what we're doing. * * The hangup event clears any pending redirections onto the hung up device. It * ensures future writes will error and it does the needed line discipline * hangup and signal delivery. The tty object itself remains intact. 
* * Locking: * * BTM * * * redirect lock for undoing redirection * * file list lock for manipulating list of ttys * * tty_ldiscs_lock from called functions * * termios_rwsem resetting termios data * * tasklist_lock to walk task list for hangup event * * * ->siglock to protect ->signal/->sighand * */ static void __tty_hangup(struct tty_struct *tty, int exit_session) { struct file *cons_filp = NULL; struct file *filp, *f; struct tty_file_private *priv; int closecount = 0, n; int refs; if (!tty) return; f = tty_release_redirect(tty); tty_lock(tty); if (test_bit(TTY_HUPPED, &tty->flags)) { tty_unlock(tty); return; } /* * Some console devices aren't actually hung up for technical and * historical reasons, which can lead to indefinite interruptible * sleep in n_tty_read(). The following explicitly tells * n_tty_read() to abort readers. */ set_bit(TTY_HUPPING, &tty->flags); /* inuse_filps is protected by the single tty lock, * this really needs to change if we want to flush the * workqueue with the lock held. */ check_tty_count(tty, "tty_hangup"); spin_lock(&tty->files_lock); /* This breaks for file handles being sent over AF_UNIX sockets ? 
*/ list_for_each_entry(priv, &tty->tty_files, list) { filp = priv->file; if (filp->f_op->write_iter == redirected_tty_write) cons_filp = filp; if (filp->f_op->write_iter != tty_write) continue; closecount++; __tty_fasync(-1, filp, 0); /* can't block */ filp->f_op = &hung_up_tty_fops; } spin_unlock(&tty->files_lock); refs = tty_signal_session_leader(tty, exit_session); /* Account for the p->signal references we killed */ while (refs--) tty_kref_put(tty); tty_ldisc_hangup(tty, cons_filp != NULL); spin_lock_irq(&tty->ctrl.lock); clear_bit(TTY_THROTTLED, &tty->flags); clear_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); put_pid(tty->ctrl.session); put_pid(tty->ctrl.pgrp); tty->ctrl.session = NULL; tty->ctrl.pgrp = NULL; tty->ctrl.pktstatus = 0; spin_unlock_irq(&tty->ctrl.lock); /* * If one of the devices matches a console pointer, we * cannot just call hangup() because that will cause * tty->count and state->count to go out of sync. * So we just call close() the right number of times. */ if (cons_filp) { if (tty->ops->close) for (n = 0; n < closecount; n++) tty->ops->close(tty, cons_filp); } else if (tty->ops->hangup) tty->ops->hangup(tty); /* * We don't want to have driver/ldisc interactions beyond the ones * we did here. The driver layer expects no calls after ->hangup() * from the ldisc side, which is now guaranteed. */ set_bit(TTY_HUPPED, &tty->flags); clear_bit(TTY_HUPPING, &tty->flags); tty_unlock(tty); if (f) fput(f); } static void do_tty_hangup(struct work_struct *work) { struct tty_struct *tty = container_of(work, struct tty_struct, hangup_work); __tty_hangup(tty, 0); } /** * tty_hangup - trigger a hangup event * @tty: tty to hangup * * A carrier loss (virtual or otherwise) has occurred on @tty. Schedule a * hangup sequence to run after this event. 
*/ void tty_hangup(struct tty_struct *tty) { tty_debug_hangup(tty, "hangup\n"); schedule_work(&tty->hangup_work); } EXPORT_SYMBOL(tty_hangup); /** * tty_vhangup - process vhangup * @tty: tty to hangup * * The user has asked via system call for the terminal to be hung up. We do * this synchronously so that when the syscall returns the process is complete. * That guarantee is necessary for security reasons. */ void tty_vhangup(struct tty_struct *tty) { tty_debug_hangup(tty, "vhangup\n"); __tty_hangup(tty, 0); } EXPORT_SYMBOL(tty_vhangup); /** * tty_vhangup_self - process vhangup for own ctty * * Perform a vhangup on the current controlling tty */ void tty_vhangup_self(void) { struct tty_struct *tty; tty = get_current_tty(); if (tty) { tty_vhangup(tty); tty_kref_put(tty); } } /** * tty_vhangup_session - hangup session leader exit * @tty: tty to hangup * * The session leader is exiting and hanging up its controlling terminal. * Every process in the foreground process group is signalled %SIGHUP. * * We do this synchronously so that when the syscall returns the process is * complete. That guarantee is necessary for security reasons. */ void tty_vhangup_session(struct tty_struct *tty) { tty_debug_hangup(tty, "session hangup\n"); __tty_hangup(tty, 1); } /** * tty_hung_up_p - was tty hung up * @filp: file pointer of tty * * Return: true if the tty has been subject to a vhangup or a carrier loss */ int tty_hung_up_p(struct file *filp) { return (filp && filp->f_op == &hung_up_tty_fops); } EXPORT_SYMBOL(tty_hung_up_p); void __stop_tty(struct tty_struct *tty) { if (tty->flow.stopped) return; tty->flow.stopped = true; if (tty->ops->stop) tty->ops->stop(tty); } /** * stop_tty - propagate flow control * @tty: tty to stop * * Perform flow control to the driver. May be called on an already stopped * device and will not re-call the &tty_driver->stop() method. * * This functionality is used by both the line disciplines for halting incoming * flow and by the driver. 
It may therefore be called from any context, may be * under the tty %atomic_write_lock but not always. * * Locking: * flow.lock */ void stop_tty(struct tty_struct *tty) { unsigned long flags; spin_lock_irqsave(&tty->flow.lock, flags); __stop_tty(tty); spin_unlock_irqrestore(&tty->flow.lock, flags); } EXPORT_SYMBOL(stop_tty); void __start_tty(struct tty_struct *tty) { if (!tty->flow.stopped || tty->flow.tco_stopped) return; tty->flow.stopped = false; if (tty->ops->start) tty->ops->start(tty); tty_wakeup(tty); } /** * start_tty - propagate flow control * @tty: tty to start * * Start a tty that has been stopped if at all possible. If @tty was previously * stopped and is now being started, the &tty_driver->start() method is invoked * and the line discipline woken. * * Locking: * flow.lock */ void start_tty(struct tty_struct *tty) { unsigned long flags; spin_lock_irqsave(&tty->flow.lock, flags); __start_tty(tty); spin_unlock_irqrestore(&tty->flow.lock, flags); } EXPORT_SYMBOL(start_tty); static void tty_update_time(struct tty_struct *tty, bool mtime) { time64_t sec = ktime_get_real_seconds(); struct tty_file_private *priv; spin_lock(&tty->files_lock); list_for_each_entry(priv, &tty->tty_files, list) { struct inode *inode = file_inode(priv->file); struct timespec64 time = mtime ? inode_get_mtime(inode) : inode_get_atime(inode); /* * We only care if the two values differ in anything other than the * lower three bits (i.e every 8 seconds). If so, then we can update * the time of the tty device, otherwise it could be construded as a * security leak to let userspace know the exact timing of the tty. */ if ((sec ^ time.tv_sec) & ~7) { if (mtime) inode_set_mtime(inode, sec, 0); else inode_set_atime(inode, sec, 0); } } spin_unlock(&tty->files_lock); } /* * Iterate on the ldisc ->read() function until we've gotten all * the data the ldisc has for us. * * The "cookie" is something that the ldisc read function can fill * in to let us know that there is more data to be had. 
*
 * We promise to continue to call the ldisc until it stops returning
 * data or clears the cookie. The cookie may be something that the
 * ldisc maintains state for and needs to free.
 *
 * Data is staged through a 64-byte on-stack buffer, which is wiped before
 * returning. The return value is the number of bytes copied to @to if that
 * is non-zero, otherwise the recorded error (or 0).
 */
static ssize_t iterate_tty_read(struct tty_ldisc *ld, struct tty_struct *tty,
				struct file *file, struct iov_iter *to)
{
	void *cookie = NULL;
	unsigned long offset = 0;	/* bytes handed to @to so far */
	ssize_t retval = 0;		/* first error seen, if any */
	size_t copied, count = iov_iter_count(to);
	u8 kernel_buf[64];

	do {
		/* Never ask for more than the bounce buffer can hold. */
		ssize_t size = min(count, sizeof(kernel_buf));

		size = ld->ops->read(tty, file, kernel_buf, size, &cookie, offset);
		if (!size)
			break;

		if (size < 0) {
			/* Did we have an earlier error (ie -EFAULT)? */
			if (retval)
				break;
			retval = size;

			/*
			 * -EOVERFLOW means we didn't have enough space
			 * for a whole packet, and we shouldn't return
			 * a partial result.
			 */
			if (retval == -EOVERFLOW)
				offset = 0;
			break;
		}

		copied = copy_to_iter(kernel_buf, size, to);
		offset += copied;
		count -= copied;

		/*
		 * If the user copy failed, we still need to do another ->read()
		 * call if we had a cookie to let the ldisc clear up.
		 *
		 * But make sure count is zeroed, so that the follow-up call
		 * is made with a size of zero.
		 */
		if (unlikely(copied != size)) {
			count = 0;
			retval = -EFAULT;
		}
	} while (cookie);

	/* We always clear tty buffer in case they contained passwords */
	memzero_explicit(kernel_buf, sizeof(kernel_buf));
	/* Data already delivered takes precedence over a later error. */
	return offset ? offset : retval;
}

/**
 * tty_read - read method for tty device files
 * @iocb: kernel I/O control block
 * @to: destination for the data read
 *
 * Perform the read system call function on this terminal device. Checks
 * for hung up devices before calling the line discipline method.
 *
 * Locking:
 *	Locks the line discipline internally while needed. Multiple read calls
 *	may be outstanding in parallel.
*/
static ssize_t tty_read(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct tty_struct *tty = file_tty(file);
	struct tty_ldisc *ld;
	ssize_t ret;

	if (tty_paranoia_check(tty, inode, "tty_read"))
		return -EIO;
	if (!tty || tty_io_error(tty))
		return -EIO;

	/* We want to wait for the line discipline to sort out in this
	 * situation.
	 */
	ld = tty_ldisc_ref_wait(tty);
	if (!ld)
		return hung_up_tty_read(iocb, to);
	/* -EIO is returned when the line discipline has no read method. */
	ret = -EIO;
	if (ld->ops->read)
		ret = iterate_tty_read(ld, tty, file, to);
	tty_ldisc_deref(ld);

	/* Only a read that returned data refreshes the access time. */
	if (ret > 0)
		tty_update_time(tty, false);

	return ret;
}

/*
 * Drop the atomic_write_lock taken by tty_write_lock() and wake up anyone
 * polling tty->write_wait for EPOLLOUT.
 */
void tty_write_unlock(struct tty_struct *tty)
{
	mutex_unlock(&tty->atomic_write_lock);
	wake_up_interruptible_poll(&tty->write_wait, EPOLLOUT);
}

/*
 * Take tty->atomic_write_lock.
 *
 * The lock is first tried without blocking. If that fails and @ndelay is
 * set, -EAGAIN is returned; otherwise the caller sleeps interruptibly and
 * -ERESTARTSYS is returned if mutex_lock_interruptible() fails.
 * Returns 0 with the lock held on success.
 */
int tty_write_lock(struct tty_struct *tty, bool ndelay)
{
	if (!mutex_trylock(&tty->atomic_write_lock)) {
		if (ndelay)
			return -EAGAIN;
		if (mutex_lock_interruptible(&tty->atomic_write_lock))
			return -ERESTARTSYS;
	}
	return 0;
}

/*
 * Split writes up in sane blocksizes to avoid
 * denial-of-service type attacks
 *
 * Returns the number of bytes written if any data was accepted by the line
 * discipline, otherwise the error (or zero) from the last step attempted.
 * The atomic_write_lock is held for the duration of the write.
 */
static ssize_t iterate_tty_write(struct tty_ldisc *ld, struct tty_struct *tty,
				 struct file *file, struct iov_iter *from)
{
	size_t chunk, count = iov_iter_count(from);
	ssize_t ret, written = 0;

	ret = tty_write_lock(tty, file->f_flags & O_NDELAY);
	if (ret < 0)
		return ret;

	/*
	 * We chunk up writes into a temporary buffer. This
	 * simplifies low-level drivers immensely, since they
	 * don't have locking issues and user mode accesses.
	 *
	 * But if TTY_NO_WRITE_SPLIT is set, we should use a
	 * big chunk-size..
	 *
	 * The default chunk-size is 2kB, because the NTTY
	 * layer has problems with bigger chunks. It will
	 * claim to be able to handle more characters than
	 * it actually does.
	 */
	chunk = 2048;
	if (test_bit(TTY_NO_WRITE_SPLIT, &tty->flags))
		chunk = 65536;
	if (count < chunk)
		chunk = count;

	/* write_buf/write_cnt is protected by the atomic_write_lock mutex */
	if (tty->write_cnt < chunk) {
		u8 *buf_chunk;

		/* Never allocate less than 1kB for the bounce buffer. */
		if (chunk < 1024)
			chunk = 1024;

		buf_chunk = kvmalloc(chunk, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
		if (!buf_chunk) {
			ret = -ENOMEM;
			goto out;
		}
		/* Swap in the larger buffer only after the allocation succeeded. */
		kvfree(tty->write_buf);
		tty->write_cnt = chunk;
		tty->write_buf = buf_chunk;
	}

	/* Do the write .. */
	for (;;) {
		size_t size = min(chunk, count);

		ret = -EFAULT;
		if (copy_from_iter(tty->write_buf, size, from) != size)
			break;

		ret = ld->ops->write(tty, file, tty->write_buf, size);
		if (ret <= 0)
			break;

		written += ret;
		/* The ldisc claims to have written more than it was given. */
		if (ret > size)
			break;

		/* FIXME! Have Al check this! */
		if (ret != size)
			iov_iter_revert(from, size-ret);

		count -= ret;
		if (!count)
			break;
		ret = -ERESTARTSYS;
		if (signal_pending(current))
			break;
		cond_resched();
	}
	/* Partial progress is reported in preference to a later error. */
	if (written) {
		tty_update_time(tty, true);
		ret = written;
	}
out:
	tty_write_unlock(tty);
	return ret;
}

#ifdef CONFIG_PRINT_QUOTA_WARNING
/**
 * tty_write_message - write a message to a certain tty, not just the console.
 * @tty: the destination tty_struct
 * @msg: the message to write
 *
 * This is used for messages that need to be redirected to a specific tty. We
 * don't put it into the syslog queue right now maybe in the future if really
 * needed.
 *
 * We must still hold the BTM and test the CLOSING flag for the moment.
 *
 * This function is DEPRECATED, do not use in new code.
*/
void tty_write_message(struct tty_struct *tty, char *msg)
{
	/* A NULL @tty is silently ignored. */
	if (tty) {
		mutex_lock(&tty->atomic_write_lock);
		tty_lock(tty);
		/* Write straight to the driver, and only while the tty is open. */
		if (tty->ops->write && tty->count > 0)
			tty->ops->write(tty, msg, strlen(msg));
		tty_unlock(tty);
		tty_write_unlock(tty);
	}
}
#endif

/*
 * Common write path for tty_write() and redirected_tty_write().
 *
 * @file: file whose tty is written to (may differ from @iocb->ki_filp when
 *        called for a redirected console write)
 * @iocb: kernel I/O control block, passed to hung_up_tty_write() if no line
 *        discipline reference can be obtained
 * @from: data to write
 *
 * Returns -EIO if the tty fails the sanity checks, has no write operation,
 * is in an I/O error state, or its line discipline has no write method;
 * otherwise the result of iterate_tty_write().
 */
static ssize_t file_tty_write(struct file *file, struct kiocb *iocb, struct iov_iter *from)
{
	struct tty_struct *tty = file_tty(file);
	struct tty_ldisc *ld;
	ssize_t ret;

	if (tty_paranoia_check(tty, file_inode(file), "tty_write"))
		return -EIO;
	if (!tty || !tty->ops->write || tty_io_error(tty))
		return -EIO;
	/* Short term debug to catch buggy drivers */
	if (tty->ops->write_room == NULL)
		tty_err(tty, "missing write_room method\n");
	ld = tty_ldisc_ref_wait(tty);
	if (!ld)
		return hung_up_tty_write(iocb, from);
	if (!ld->ops->write)
		ret = -EIO;
	else
		ret = iterate_tty_write(ld, tty, file, from);
	tty_ldisc_deref(ld);
	return ret;
}

/**
 * tty_write - write method for tty device file
 * @iocb: kernel I/O control block
 * @from: iov_iter with data to write
 *
 * Write data to a tty device via the line discipline.
 *
 * Locking:
 *	Locks the line discipline as required
 *	Writes to the tty driver are serialized by the atomic_write_lock
 *	and are then processed in chunks to the device. The line
 *	discipline write method will not be invoked in parallel for
 *	each device.
 */
static ssize_t tty_write(struct kiocb *iocb, struct iov_iter *from)
{
	return file_tty_write(iocb->ki_filp, iocb, from);
}

/*
 * Write method that honours console redirection: if a redirect file is
 * currently set, the data is written to that file's tty instead of the one
 * behind @iocb; otherwise this behaves like tty_write().
 */
ssize_t redirected_tty_write(struct kiocb *iocb, struct iov_iter *iter)
{
	struct file *p = NULL;

	/* Take our own reference on the redirect target under redirect_lock. */
	spin_lock(&redirect_lock);
	if (redirect)
		p = get_file(redirect);
	spin_unlock(&redirect_lock);

	/*
	 * We know the redirected tty is just another tty, we can
	 * call file_tty_write() directly with that file pointer.
	 */
	if (p) {
		ssize_t res;

		res = file_tty_write(p, iocb, iter);
		fput(p);
		return res;
	}
	return tty_write(iocb, iter);
}

/**
 * tty_send_xchar - send priority character
 * @tty: the tty to send to
 * @ch: xchar to send
 *
 * Send a high priority character to the tty even if stopped.
 *
 * Locking: none for xchar method, write ordering for write method.
 *
 * Return: 0 on success, -ERESTARTSYS if the write lock could not be taken.
 */
int tty_send_xchar(struct tty_struct *tty, u8 ch)
{
	bool was_stopped = tty->flow.stopped;

	/* Prefer the driver's dedicated send_xchar() operation when present. */
	if (tty->ops->send_xchar) {
		down_read(&tty->termios_rwsem);
		tty->ops->send_xchar(tty, ch);
		up_read(&tty->termios_rwsem);
		return 0;
	}

	if (tty_write_lock(tty, false) < 0)
		return -ERESTARTSYS;

	down_read(&tty->termios_rwsem);
	/* Temporarily restart a stopped tty around the single-byte write. */
	if (was_stopped)
		start_tty(tty);
	tty->ops->write(tty, &ch, 1);
	if (was_stopped)
		stop_tty(tty);
	up_read(&tty->termios_rwsem);
	tty_write_unlock(tty);
	return 0;
}

/**
 * pty_line_name - generate name for a pty
 * @driver: the tty driver in use
 * @index: the minor number
 * @p: output buffer of at least 6 bytes
 *
 * Generate a name from a @driver reference and write it to the output buffer
 * @p.
 *
 * Locking: None
 */
static void pty_line_name(struct tty_driver *driver, int index, char *p)
{
	static const char ptychar[] = "pqrstuvwxyzabcde";
	int i = index + driver->name_base;
	/* ->name is initialized to "ttyp", but "tty" is expected */
	/* Bits 4-7 of i select the letter, bits 0-3 the trailing hex digit. */
	sprintf(p, "%s%c%x",
		driver->subtype == PTY_TYPE_SLAVE ? "tty" : driver->name,
		ptychar[i >> 4 & 0xf], i & 0xf);
}

/**
 * tty_line_name - generate name for a tty
 * @driver: the tty driver in use
 * @index: the minor number
 * @p: output buffer of at least 7 bytes
 *
 * Generate a name from a @driver reference and write it to the output buffer
 * @p.
*
 * Locking: None
 */
static ssize_t tty_line_name(struct tty_driver *driver, int index, char *p)
{
	/* Unnumbered nodes use the bare driver name; others append the index. */
	if (driver->flags & TTY_DRIVER_UNNUMBERED_NODE)
		return sprintf(p, "%s", driver->name);
	else
		return sprintf(p, "%s%d", driver->name,
			       index + driver->name_base);
}

/**
 * tty_driver_lookup_tty() - find an existing tty, if any
 * @driver: the driver for the tty
 * @file: file object
 * @idx: the minor number
 *
 * Return: the tty, if found. If not found, return %NULL or ERR_PTR() if the
 * driver lookup() method returns an error.
 *
 * Locking: tty_mutex must be held. If the tty is found, bump the tty kref.
 */
static struct tty_struct *tty_driver_lookup_tty(struct tty_driver *driver,
		struct file *file, int idx)
{
	struct tty_struct *tty;

	if (driver->ops->lookup) {
		/* A driver lookup() method is never called without a file. */
		if (!file)
			tty = ERR_PTR(-EIO);
		else
			tty = driver->ops->lookup(driver, file, idx);
	} else {
		/* No lookup() method: index the driver's own tty table. */
		if (idx >= driver->num)
			return ERR_PTR(-EINVAL);
		tty = driver->ttys[idx];
	}
	if (!IS_ERR(tty))
		tty_kref_get(tty);
	return tty;
}

/**
 * tty_init_termios - helper for termios setup
 * @tty: the tty to set up
 *
 * Initialise the termios structure for this tty. This runs under the
 * %tty_mutex currently so we can be relaxed about ordering.
*/
void tty_init_termios(struct tty_struct *tty)
{
	struct ktermios *tp;
	int idx = tty->index;

	/* Drivers flagged RESET_TERMIOS always start from init_termios. */
	if (tty->driver->flags & TTY_DRIVER_RESET_TERMIOS)
		tty->termios = tty->driver->init_termios;
	else {
		/* Check for lazy saved data */
		tp = tty->driver->termios[idx];
		if (tp != NULL) {
			tty->termios = *tp;
			/* The saved line discipline number is not restored. */
			tty->termios.c_line  = tty->driver->init_termios.c_line;
		} else
			tty->termios = tty->driver->init_termios;
	}
	/* Compatibility until drivers always set this */
	tty->termios.c_ispeed = tty_termios_input_baud_rate(&tty->termios);
	tty->termios.c_ospeed = tty_termios_baud_rate(&tty->termios);
}
EXPORT_SYMBOL_GPL(tty_init_termios);

/**
 * tty_standard_install - usual tty->ops->install
 * @driver: the driver for the tty
 * @tty: the tty
 *
 * If the @driver overrides @tty->ops->install, it still can call this function
 * to perform the standard install operations.
 *
 * Return: always 0.
 */
int tty_standard_install(struct tty_driver *driver, struct tty_struct *tty)
{
	tty_init_termios(tty);
	tty_driver_kref_get(driver);
	tty->count++;
	/* Publish the tty in the driver's table at its index. */
	driver->ttys[tty->index] = tty;
	return 0;
}
EXPORT_SYMBOL_GPL(tty_standard_install);

/**
 * tty_driver_install_tty() - install a tty entry in the driver
 * @driver: the driver for the tty
 * @tty: the tty
 *
 * Install a tty object into the driver tables. The @tty->index field will be
 * set by the time this is called. This method is responsible for ensuring any
 * needed additional structures are allocated and configured.
 *
 * Locking: tty_mutex for now
 *
 * Return: result of the driver's install() method if it has one, otherwise
 * the result of tty_standard_install().
 */
static int tty_driver_install_tty(struct tty_driver *driver,
						struct tty_struct *tty)
{
	return driver->ops->install ? driver->ops->install(driver, tty) :
		tty_standard_install(driver, tty);
}

/**
 * tty_driver_remove_tty() - remove a tty from the driver tables
 * @driver: the driver for the tty
 * @tty: tty to remove
 *
 * Remove a tty object from the driver tables. The tty->index field will be set
 * by the time this is called.
*
 * Locking: tty_mutex for now
 */
static void tty_driver_remove_tty(struct tty_driver *driver, struct tty_struct *tty)
{
	/* Without a driver remove() method, just clear the table slot. */
	if (driver->ops->remove)
		driver->ops->remove(driver, tty);
	else
		driver->ttys[tty->index] = NULL;
}

/**
 * tty_reopen() - fast re-open of an open tty
 * @tty: the tty to open
 *
 * Re-opens on master ptys are not allowed and return -%EIO.
 *
 * Locking: Caller must hold tty_lock
 * Return: 0 on success, -errno on error.
 */
static int tty_reopen(struct tty_struct *tty)
{
	struct tty_driver *driver = tty->driver;
	struct tty_ldisc *ld;
	int retval = 0;

	if (driver->type == TTY_DRIVER_TYPE_PTY &&
	    driver->subtype == PTY_TYPE_MASTER)
		return -EIO;

	/* A zero count is reported as -EAGAIN. */
	if (!tty->count)
		return -EAGAIN;

	if (test_bit(TTY_EXCLUSIVE, &tty->flags) && !capable(CAP_SYS_ADMIN))
		return -EBUSY;

	ld = tty_ldisc_ref_wait(tty);
	if (ld) {
		/* A line discipline is attached; the reference is not needed. */
		tty_ldisc_deref(ld);
	} else {
		/* No ldisc reference: reinitialise it under the ldisc lock. */
		retval = tty_ldisc_lock(tty, 5 * HZ);
		if (retval)
			return retval;

		if (!tty->ldisc)
			retval = tty_ldisc_reinit(tty, tty->termios.c_line);
		tty_ldisc_unlock(tty);
	}

	/* Only a successful reopen bumps the open count. */
	if (retval == 0)
		tty->count++;

	return retval;
}

/**
 * tty_init_dev - initialise a tty device
 * @driver: tty driver we are opening a device on
 * @idx: device index
 *
 * Prepare a tty device. This may not be a "new" clean device but could also be
 * an active device. The pty drivers require special handling because of this.
 *
 * Locking:
 *	The function is called under the tty_mutex, which protects us from the
 *	tty struct or driver itself going away.
 *
 * On exit the tty device has the line discipline attached and a reference
 * count of 1. If a pair was created for pty/tty use and the other was a pty
 * master then it too has a reference count of 1.
 *
 * WSH 06/09/97: Rewritten to remove races and properly clean up after a failed
 * open. The new code protects the open with a mutex, so it's really quite
 * straightforward. The mutex locking can probably be relaxed for the (most
 * common) case of reopening a tty.
*
 * Return: new tty structure
 */
struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx)
{
	struct tty_struct *tty;
	int retval;

	/*
	 * First time open is complex, especially for PTY devices.
	 * This code guarantees that either everything succeeds and the
	 * TTY is ready for operation, or else the table slots are vacated
	 * and the allocated memory released.  (Except that the termios
	 * may be retained.)
	 */

	/* Pin the driver module; dropped again on the early error paths. */
	if (!try_module_get(driver->owner))
		return ERR_PTR(-ENODEV);

	tty = alloc_tty_struct(driver, idx);
	if (!tty) {
		retval = -ENOMEM;
		goto err_module_put;
	}

	tty_lock(tty);
	retval = tty_driver_install_tty(driver, tty);
	if (retval < 0)
		goto err_free_tty;

	/* Fall back to the driver's port table if install left port unset. */
	if (!tty->port)
		tty->port = driver->ports[idx];

	if (WARN_RATELIMIT(!tty->port,
			"%s: %s driver does not set tty->port. This would crash the kernel. Fix the driver!\n",
			__func__, tty->driver->name)) {
		retval = -EINVAL;
		goto err_release_lock;
	}

	retval = tty_ldisc_lock(tty, 5 * HZ);
	if (retval)
		goto err_release_lock;
	tty->port->itty = tty;

	/*
	 * Structures all installed ... call the ldisc open routines.
	 * If we fail here just call release_tty to clean up.  No need
	 * to decrement the use counts, as release_tty doesn't care.
	 */
	retval = tty_ldisc_setup(tty, tty->link);
	if (retval)
		goto err_release_tty;
	tty_ldisc_unlock(tty);
	/* Return the tty locked so that it cannot vanish under the caller */
	return tty;

	/* Install failed: nothing is in the driver table yet, free directly. */
err_free_tty:
	tty_unlock(tty);
	free_tty_struct(tty);
err_module_put:
	module_put(driver->owner);
	return ERR_PTR(retval);

	/* call the tty release_tty routine to clean out this slot */
err_release_tty:
	tty_ldisc_unlock(tty);
	tty_info_ratelimited(tty, "ldisc open failed (%d), clearing slot %d\n",
			     retval, idx);
err_release_lock:
	tty_unlock(tty);
	release_tty(tty, idx);
	return ERR_PTR(retval);
}

/**
 * tty_save_termios() - save tty termios data in driver table
 * @tty: tty whose termios data to save
 *
 * Locking: Caller guarantees serialisation with tty_init_termios().
*/
void tty_save_termios(struct tty_struct *tty)
{
	struct ktermios *tp;
	int idx = tty->index;

	/* If the port is going to reset then it has no termios to save */
	if (tty->driver->flags & TTY_DRIVER_RESET_TERMIOS)
		return;

	/* Stash the termios data */
	tp = tty->driver->termios[idx];
	if (tp == NULL) {
		/* First save for this index: allocate the slot lazily. */
		tp = kmalloc(sizeof(*tp), GFP_KERNEL);
		/* On allocation failure the termios is simply not saved. */
		if (tp == NULL)
			return;
		tty->driver->termios[idx] = tp;
	}
	*tp = tty->termios;
}
EXPORT_SYMBOL_GPL(tty_save_termios);

/**
 * tty_flush_works - flush all works of a tty/pty pair
 * @tty: tty device to flush works for (or either end of a pty pair)
 *
 * Sync flush all works belonging to @tty (and the 'other' tty).
 */
static void tty_flush_works(struct tty_struct *tty)
{
	flush_work(&tty->SAK_work);
	flush_work(&tty->hangup_work);
	/* Flush the same two work items on the linked tty, if there is one. */
	if (tty->link) {
		flush_work(&tty->link->SAK_work);
		flush_work(&tty->link->hangup_work);
	}
}

/**
 * release_one_tty - release tty structure memory
 * @work: work of tty we are obliterating
 *
 * Releases memory associated with a tty structure, and clears out the
 * driver table slots. This function is called when a device is no longer
 * in use. It also gets called when setup of a device fails.
 *
 * Locking:
 *	takes the file list lock internally when working on the list of ttys
 *	that the driver keeps.
*
 * This method gets called from a work queue so that the driver private
 * cleanup ops can sleep (needed for USB at least)
 */
static void release_one_tty(struct work_struct *work)
{
	/* The work item is the tty's hangup_work, see queue_release_one_tty(). */
	struct tty_struct *tty =
		container_of(work, struct tty_struct, hangup_work);
	struct tty_driver *driver = tty->driver;
	struct module *owner = driver->owner;

	if (tty->ops->cleanup)
		tty->ops->cleanup(tty);

	/* driver and owner were read above, before the driver kref is dropped. */
	tty_driver_kref_put(driver);
	module_put(owner);

	spin_lock(&tty->files_lock);
	list_del_init(&tty->tty_files);
	spin_unlock(&tty->files_lock);

	put_pid(tty->ctrl.pgrp);
	put_pid(tty->ctrl.session);
	free_tty_struct(tty);
}

/*
 * kref release callback for a tty: defers the actual teardown to
 * release_one_tty() by scheduling it as a work item.
 */
static void queue_release_one_tty(struct kref *kref)
{
	struct tty_struct *tty = container_of(kref, struct tty_struct, kref);

	/* The hangup queue is now free so we can reuse it rather than
	 *  waste a chunk of memory for each port.
	 */
	INIT_WORK(&tty->hangup_work, release_one_tty);
	schedule_work(&tty->hangup_work);
}

/**
 * tty_kref_put - release a tty kref
 * @tty: tty device
 *
 * Release a reference to the @tty device and if need be let the kref layer
 * destruct the object for us. A %NULL @tty is ignored.
 */
void tty_kref_put(struct tty_struct *tty)
{
	if (tty)
		kref_put(&tty->kref, queue_release_one_tty);
}
EXPORT_SYMBOL(tty_kref_put);

/**
 * release_tty - release tty structure memory
 * @tty: tty device release
 * @idx: index of the tty device release
 *
 * Release both @tty and a possible linked partner (think pty pair),
 * and decrement the refcount of the backing module.
 *
 * Locking:
 *	tty_mutex
 *	takes the file list lock internally when working on the list of ttys
 *	that the driver keeps.
*/
static void release_tty(struct tty_struct *tty, int idx)
{
	/* This should always be true but check for the moment */
	WARN_ON(tty->index != idx);
	WARN_ON(!mutex_is_locked(&tty_mutex));
	if (tty->ops->shutdown)
		tty->ops->shutdown(tty);
	/* Save the termios before the tty leaves the driver table. */
	tty_save_termios(tty);
	tty_driver_remove_tty(tty->driver, tty);
	/* Detach the port back-pointers for this tty and its link. */
	if (tty->port)
		tty->port->itty = NULL;
	if (tty->link)
		tty->link->port->itty = NULL;
	if (tty->port)
		tty_buffer_cancel_work(tty->port);
	if (tty->link)
		tty_buffer_cancel_work(tty->link->port);

	/* tty_kref_put() ignores NULL, so an absent link needs no check. */
	tty_kref_put(tty->link);
	tty_kref_put(tty);
}

/**
 * tty_release_checks - check a tty before real release
 * @tty: tty to check
 * @idx: index of the tty
 *
 * Performs some paranoid checking before true release of the @tty. This is a
 * no-op unless %TTY_PARANOIA_CHECK is defined.
 *
 * Return: 0 if the checks pass (or are compiled out), -1 otherwise.
 */
static int tty_release_checks(struct tty_struct *tty, int idx)
{
#ifdef TTY_PARANOIA_CHECK
	if (idx < 0 || idx >= tty->driver->num) {
		tty_debug(tty, "bad idx %d\n", idx);
		return -1;
	}

	/* not much to check for devpts */
	if (tty->driver->flags & TTY_DRIVER_DEVPTS_MEM)
		return 0;

	if (tty != tty->driver->ttys[idx]) {
		tty_debug(tty, "bad driver table[%d] = %p\n",
			  idx, tty->driver->ttys[idx]);
		return -1;
	}
	/* For paired drivers, verify the link and its table slot too. */
	if (tty->driver->other) {
		struct tty_struct *o_tty = tty->link;

		if (o_tty != tty->driver->other->ttys[idx]) {
			tty_debug(tty, "bad other table[%d] = %p\n",
				  idx, tty->driver->other->ttys[idx]);
			return -1;
		}
		if (o_tty->link != tty) {
			tty_debug(tty, "bad link = %p\n", o_tty->link);
			return -1;
		}
	}
#endif
	return 0;
}

/**
 * tty_kclose - closes tty opened by tty_kopen
 * @tty: tty device
 *
 * Performs the final steps to release and free a tty device. It is the same as
 * tty_release_struct() except that it also resets %TTY_PORT_KOPENED flag on
 * @tty->port.
*/
void tty_kclose(struct tty_struct *tty)
{
	/*
	 * Ask the line discipline code to release its structures
	 */
	tty_ldisc_release(tty);

	/* Wait for pending work before tty destruction commences */
	tty_flush_works(tty);

	tty_debug_hangup(tty, "freeing structure\n");
	/*
	 * The release_tty function takes care of the details of clearing
	 * the slots and preserving the termios structure.
	 */
	mutex_lock(&tty_mutex);
	/* Clear the kopened state before the tty is released. */
	tty_port_set_kopened(tty->port, 0);
	release_tty(tty, tty->index);
	mutex_unlock(&tty_mutex);
}
EXPORT_SYMBOL_GPL(tty_kclose);

/**
 * tty_release_struct - release a tty struct
 * @tty: tty device
 * @idx: index of the tty
 *
 * Performs the final steps to release and free a tty device. It is roughly the
 * reverse of tty_init_dev().
 */
void tty_release_struct(struct tty_struct *tty, int idx)
{
	/*
	 * Ask the line discipline code to release its structures
	 */
	tty_ldisc_release(tty);

	/* Wait for pending work before tty destruction commences */
	tty_flush_works(tty);

	tty_debug_hangup(tty, "freeing structure\n");
	/*
	 * The release_tty function takes care of the details of clearing
	 * the slots and preserving the termios structure.
	 */
	mutex_lock(&tty_mutex);
	release_tty(tty, idx);
	mutex_unlock(&tty_mutex);
}
EXPORT_SYMBOL_GPL(tty_release_struct);

/**
 * tty_release - vfs callback for close
 * @inode: inode of tty
 * @filp: file pointer for handle to tty
 *
 * Called the last time each file handle is closed that references this tty.
 * There may however be several such references.
 *
 * Locking:
 *	Takes BKL. See tty_release_dev().
 *
 * Even releasing the tty structures is a tricky business. We have to be very
 * careful that the structures are all released at the same time, as interrupts
 * might otherwise get the wrong pointers.
 *
 * WSH 09/09/97: rewritten to avoid some nasty race conditions that could
 * lead to double frees or releasing memory still in use.
*/
int tty_release(struct inode *inode, struct file *filp)
{
	struct tty_struct *tty = file_tty(filp);
	struct tty_struct *o_tty = NULL;	/* other tty; set only for a pty master */
	int	do_sleep, final;
	int	idx;
	long	timeout = 0;			/* back-off between wait queue checks */
	int	once = 1;			/* warn about active waiters only once */

	if (tty_paranoia_check(tty, inode, __func__))
		return 0;

	tty_lock(tty);
	check_tty_count(tty, __func__);

	__tty_fasync(-1, filp, 0);

	idx = tty->index;
	if (tty->driver->type == TTY_DRIVER_TYPE_PTY &&
	    tty->driver->subtype == PTY_TYPE_MASTER)
		o_tty = tty->link;

	if (tty_release_checks(tty, idx)) {
		tty_unlock(tty);
		return 0;
	}

	tty_debug_hangup(tty, "releasing (count=%d)\n", tty->count);

	if (tty->ops->close)
		tty->ops->close(tty, filp);

	/* If tty is pty master, lock the slave pty (stable lock order) */
	tty_lock_slave(o_tty);

	/*
	 * Sanity check: if tty->count is going to zero, there shouldn't be
	 * any waiters on tty->read_wait or tty->write_wait.  We test the
	 * wait queues and kick everyone out _before_ actually starting to
	 * close.  This ensures that we won't block while releasing the tty
	 * structure.
	 *
	 * The test for the o_tty closing is necessary, since the master and
	 * slave sides may close in any order.  If the slave side closes out
	 * first, its count will be one, since the master side holds an open.
	 * Thus this test wouldn't be triggered at the time the slave closed,
	 * so we do it now.
	 */
	while (1) {
		do_sleep = 0;

		if (tty->count <= 1) {
			if (waitqueue_active(&tty->read_wait)) {
				wake_up_poll(&tty->read_wait, EPOLLIN);
				do_sleep++;
			}
			if (waitqueue_active(&tty->write_wait)) {
				wake_up_poll(&tty->write_wait, EPOLLOUT);
				do_sleep++;
			}
		}
		if (o_tty && o_tty->count <= 1) {
			if (waitqueue_active(&o_tty->read_wait)) {
				wake_up_poll(&o_tty->read_wait, EPOLLIN);
				do_sleep++;
			}
			if (waitqueue_active(&o_tty->write_wait)) {
				wake_up_poll(&o_tty->write_wait, EPOLLOUT);
				do_sleep++;
			}
		}
		/* No waiters were found on any of the queues: proceed. */
		if (!do_sleep)
			break;

		if (once) {
			once = 0;
			tty_warn(tty, "read/write wait queue active!\n");
		}
		schedule_timeout_killable(timeout);
		/* Roughly double the wait each round until it passes 120s. */
		if (timeout < 120 * HZ)
			timeout = 2 * timeout + 1;
		else
			timeout = MAX_SCHEDULE_TIMEOUT;
	}

	/* Drop one open from each side, clamping a negative count to zero. */
	if (o_tty) {
		if (--o_tty->count < 0) {
			tty_warn(tty, "bad slave count (%d)\n", o_tty->count);
			o_tty->count = 0;
		}
	}
	if (--tty->count < 0) {
		tty_warn(tty, "bad tty->count (%d)\n", tty->count);
		tty->count = 0;
	}

	/*
	 * We've decremented tty->count, so we need to remove this file
	 * descriptor off the tty->tty_files list; this serves two
	 * purposes:
	 *  - check_tty_count sees the correct number of file descriptors
	 *    associated with this tty.
	 *  - do_tty_hangup no longer sees this file descriptor as
	 *    something that needs to be handled for hangups.
	 */
	tty_del_file(filp);

	/*
	 * Perform some housekeeping before deciding whether to return.
	 *
	 * If _either_ side is closing, make sure there aren't any
	 * processes that still think tty or o_tty is their controlling
	 * tty.
	 */
	if (!tty->count) {
		read_lock(&tasklist_lock);
		session_clear_tty(tty->ctrl.session);
		if (o_tty)
			session_clear_tty(o_tty->ctrl.session);
		read_unlock(&tasklist_lock);
	}

	/* check whether both sides are closing ... */
	final = !tty->count && !(o_tty && o_tty->count);

	tty_unlock_slave(o_tty);
	tty_unlock(tty);

	/* At this point, the tty->count == 0 should ensure a dead tty
	 * cannot be re-opened by a racing opener.
	 */

	if (!final)
		return 0;

	tty_debug_hangup(tty, "final close\n");

	tty_release_struct(tty, idx);
	return 0;
}

/**
 * tty_open_current_tty - get locked tty of current task
 * @device: device number
 * @filp: file pointer to tty
 * @return: locked tty of the current task iff @device is /dev/tty
 *
 * Performs a re-open of the current task's controlling tty.
 *
 * We cannot return driver and index like for the other nodes because devpts
 * will not work then. It expects inodes to be from devpts FS.
 */
static struct tty_struct *tty_open_current_tty(dev_t device, struct file *filp)
{
	struct tty_struct *tty;
	int retval;

	/* Any other device number yields NULL. */
	if (device != MKDEV(TTYAUX_MAJOR, 0))
		return NULL;

	tty = get_current_tty();
	if (!tty)
		return ERR_PTR(-ENXIO);

	filp->f_flags |= O_NONBLOCK; /* Don't let /dev/tty block */
	/* noctty = 1; */
	tty_lock(tty);
	tty_kref_put(tty);	/* safe to drop the kref now */

	retval = tty_reopen(tty);
	if (retval < 0) {
		tty_unlock(tty);
		tty = ERR_PTR(retval);
	}
	return tty;
}

/**
 * tty_lookup_driver - lookup a tty driver for a given device file
 * @device: device number
 * @filp: file pointer to tty
 * @index: index for the device in the @return driver
 *
 * If returned value is not erroneous, the caller is responsible to decrement
 * the refcount by tty_driver_kref_put().
*
 * Locking: %tty_mutex protects get_tty_driver()
 *
 * Return: driver for this inode (with increased refcount)
 */
static struct tty_driver *tty_lookup_driver(dev_t device, struct file *filp,
		int *index)
{
	struct tty_driver *driver = NULL;

	switch (device) {
#ifdef CONFIG_VT
	case MKDEV(TTY_MAJOR, 0): {
		extern struct tty_driver *console_driver;

		/* This device number maps to console_driver at index fg_console. */
		driver = tty_driver_kref_get(console_driver);
		*index = fg_console;
		break;
	}
#endif
	case MKDEV(TTYAUX_MAJOR, 1): {
		struct tty_driver *console_driver = console_device(index);

		if (console_driver) {
			driver = tty_driver_kref_get(console_driver);
			if (driver && filp) {
				/* Don't let /dev/console block */
				filp->f_flags |= O_NONBLOCK;
				break;
			}
		}
		/* No console driver, or no @filp: drop any kref and fail. */
		if (driver)
			tty_driver_kref_put(driver);
		return ERR_PTR(-ENODEV);
	}
	default:
		driver = get_tty_driver(device, index);
		if (!driver)
			return ERR_PTR(-ENODEV);
		break;
	}
	return driver;
}

/*
 * Common implementation of tty_kopen_exclusive() and tty_kopen_shared().
 *
 * @device: dev_t of device to open
 * @shared: if non-zero, return whatever tty_driver_lookup_tty() found (this
 *          may be NULL) without initialising a new tty; if zero, fail with
 *          -EBUSY when a tty already exists, otherwise create one with
 *          tty_init_dev() and mark its port as kopened.
 *
 * The driver lookup, tty lookup and initialisation all run under tty_mutex.
 * The driver reference taken by the lookup is dropped before returning.
 */
static struct tty_struct *tty_kopen(dev_t device, int shared)
{
	struct tty_struct *tty;
	struct tty_driver *driver;
	int index = -1;

	mutex_lock(&tty_mutex);
	driver = tty_lookup_driver(device, NULL, &index);
	if (IS_ERR(driver)) {
		mutex_unlock(&tty_mutex);
		return ERR_CAST(driver);
	}

	/* check whether we're reopening an existing tty */
	tty = tty_driver_lookup_tty(driver, NULL, index);
	if (IS_ERR(tty) || shared)
		goto out;

	if (tty) {
		/* drop kref from tty_driver_lookup_tty() */
		tty_kref_put(tty);
		tty = ERR_PTR(-EBUSY);
	} else { /* tty_init_dev returns tty with the tty_lock held */
		tty = tty_init_dev(driver, index);
		if (IS_ERR(tty))
			goto out;
		tty_port_set_kopened(tty->port, 1);
	}
out:
	mutex_unlock(&tty_mutex);
	tty_driver_kref_put(driver);
	return tty;
}

/**
 * tty_kopen_exclusive - open a tty device for kernel
 * @device: dev_t of device to open
 *
 * Opens tty exclusively for kernel. Performs the driver lookup, makes sure
 * it's not already opened and performs the first-time tty initialization.
*
 * Claims the global %tty_mutex to serialize:
 *  * concurrent first-time tty initialization
 *  * concurrent tty driver removal w/ lookup
 *  * concurrent tty removal from driver table
 *
 * Return: the locked initialized &tty_struct
 */
struct tty_struct *tty_kopen_exclusive(dev_t device)
{
	return tty_kopen(device, 0);
}
EXPORT_SYMBOL_GPL(tty_kopen_exclusive);

/**
 * tty_kopen_shared - open a tty device for shared in-kernel use
 * @device: dev_t of device to open
 *
 * Opens an already existing tty for in-kernel use. Compared to
 * tty_kopen_exclusive() above it doesn't ensure to be the only user.
 *
 * Locking: identical to tty_kopen() above.
 */
struct tty_struct *tty_kopen_shared(dev_t device)
{
	return tty_kopen(device, 1);
}
EXPORT_SYMBOL_GPL(tty_kopen_shared);

/**
 * tty_open_by_driver - open a tty device
 * @device: dev_t of device to open
 * @filp: file pointer to tty
 *
 * Performs the driver lookup, checks for a reopen, or otherwise performs the
 * first-time tty initialization.
 *
 *
 * Claims the global tty_mutex to serialize:
 *  * concurrent first-time tty initialization
 *  * concurrent tty driver removal w/ lookup
 *  * concurrent tty removal from driver table
 *
 * Return: the locked initialized or re-opened &tty_struct
 */
static struct tty_struct *tty_open_by_driver(dev_t device,
					     struct file *filp)
{
	struct tty_struct *tty;
	struct tty_driver *driver = NULL;
	int index = -1;
	int retval;

	mutex_lock(&tty_mutex);
	driver = tty_lookup_driver(device, filp, &index);
	if (IS_ERR(driver)) {
		mutex_unlock(&tty_mutex);
		return ERR_CAST(driver);
	}

	/* check whether we're reopening an existing tty */
	tty = tty_driver_lookup_tty(driver, filp, index);
	if (IS_ERR(tty)) {
		mutex_unlock(&tty_mutex);
		goto out;
	}

	if (tty) {
		/* A tty whose port is marked kopened cannot be opened here. */
		if (tty_port_kopened(tty->port)) {
			tty_kref_put(tty);
			mutex_unlock(&tty_mutex);
			tty = ERR_PTR(-EBUSY);
			goto out;
		}
		/* tty_mutex is released before sleeping on the tty lock. */
		mutex_unlock(&tty_mutex);
		retval = tty_lock_interruptible(tty);
		tty_kref_put(tty);  /* drop kref from tty_driver_lookup_tty() */
		if (retval) {
			/* An interrupted lock attempt is reported as -ERESTARTSYS. */
			if (retval == -EINTR)
				retval = -ERESTARTSYS;
			tty = ERR_PTR(retval);
			goto out;
		}
		retval = tty_reopen(tty);
		if (retval < 0) {
			tty_unlock(tty);
			tty = ERR_PTR(retval);
		}
	} else { /* Returns with the tty_lock held for now */
		tty = tty_init_dev(driver, index);
		mutex_unlock(&tty_mutex);
	}
out:
	tty_driver_kref_put(driver);
	return tty;
}

/**
 * tty_open - open a tty device
 * @inode: inode of device file
 * @filp: file pointer to tty
 *
 * tty_open() and tty_release() keep up the tty count that contains the number
 * of opens done on a tty. We cannot use the inode-count, as different inodes
 * might point to the same tty.
 *
 * Open-counting is needed for pty masters, as well as for keeping track of
 * serial lines: DTR is dropped when the last close happens.
 * (This is not done solely through tty->count, now.  - Ted 1/27/92)
 *
 * The termios state of a pty is reset on the first open so that settings don't
 * persist across reuse.
 *
 * Locking:
 *  * %tty_mutex protects tty, tty_lookup_driver() and tty_init_dev().
 *  * @tty->count should protect the rest.
*  * ->siglock protects ->signal/->sighand
 *
 * Note: the tty_unlock/lock cases without a ref are only safe due to %tty_mutex
 */
static int tty_open(struct inode *inode, struct file *filp)
{
	struct tty_struct *tty;
	int noctty, retval;
	dev_t device = inode->i_rdev;
	/* The lookup helpers may OR O_NONBLOCK into f_flags; restored below. */
	unsigned saved_flags = filp->f_flags;

	nonseekable_open(inode, filp);

retry_open:
	/* Any tty_alloc_file() failure is reported as -ENOMEM. */
	retval = tty_alloc_file(filp);
	if (retval)
		return -ENOMEM;

	/* Try the /dev/tty path first; NULL means it was some other device. */
	tty = tty_open_current_tty(device, filp);
	if (!tty)
		tty = tty_open_by_driver(device, filp);

	if (IS_ERR(tty)) {
		tty_free_file(filp);
		retval = PTR_ERR(tty);
		/* Only -EAGAIN without a pending signal is retried. */
		if (retval != -EAGAIN || signal_pending(current))
			return retval;
		schedule();
		goto retry_open;
	}

	tty_add_file(tty, filp);

	check_tty_count(tty, __func__);
	tty_debug_hangup(tty, "opening (count=%d)\n", tty->count);

	/* A driver without an open() operation cannot be opened. */
	if (tty->ops->open)
		retval = tty->ops->open(tty, filp);
	else
		retval = -ENODEV;
	filp->f_flags = saved_flags;

	if (retval) {
		tty_debug_hangup(tty, "open error %d, releasing\n", retval);

		tty_unlock(tty); /* need to call tty_release without BTM */
		tty_release(inode, filp);
		/* Only -ERESTARTSYS without a pending signal is retried. */
		if (retval != -ERESTARTSYS)
			return retval;

		if (signal_pending(current))
			return retval;

		schedule();
		/*
		 * Need to reset f_op in case a hangup happened.
		 */
		if (tty_hung_up_p(filp))
			filp->f_op = &tty_fops;
		goto retry_open;
	}
	clear_bit(TTY_HUPPED, &tty->flags);

	/*
	 * The opened tty is not passed to tty_open_proc_set_tty() when
	 * O_NOCTTY was given, or when the device is MKDEV(TTY_MAJOR, 0)
	 * (with CONFIG_VT), MKDEV(TTYAUX_MAJOR, 1), or a pty master.
	 */
	noctty = (filp->f_flags & O_NOCTTY) ||
		 (IS_ENABLED(CONFIG_VT) && device == MKDEV(TTY_MAJOR, 0)) ||
		 device == MKDEV(TTYAUX_MAJOR, 1) ||
		 (tty->driver->type == TTY_DRIVER_TYPE_PTY &&
		  tty->driver->subtype == PTY_TYPE_MASTER);
	if (!noctty)
		tty_open_proc_set_tty(filp, tty);
	tty_unlock(tty);
	return 0;
}

/**
 * tty_poll - check tty status
 * @filp: file being polled
 * @wait: poll wait structures to update
 *
 * Call the line discipline polling method to obtain the poll status of the
 * device.
 *
 * Locking: locks called line discipline but ldisc poll method may be
 * re-entered freely by other callers.
*/
static __poll_t tty_poll(struct file *filp, poll_table *wait)
{
	struct tty_struct *tty = file_tty(filp);
	struct tty_ldisc *ld;
	__poll_t ret = 0;

	if (tty_paranoia_check(tty, file_inode(filp), "tty_poll"))
		return 0;

	ld = tty_ldisc_ref_wait(tty);
	if (!ld)
		return hung_up_tty_poll(filp, wait);
	/* A line discipline without a poll method reports no events. */
	if (ld->ops->poll)
		ret = ld->ops->poll(tty, filp, wait);
	tty_ldisc_deref(ld);
	return ret;
}

/*
 * Update the tty's fasync list for @filp and, when async notification is
 * being enabled, set the file's owner.
 *
 * The owner is the tty's ctrl.pgrp if one is set, otherwise the current
 * task. This helper takes no tty lock itself; tty_fasync() calls it with
 * the tty lock held.
 *
 * Returns 0 on success (or if the paranoia check fails), or the value of
 * fasync_helper() when that is zero or negative.
 */
static int __tty_fasync(int fd, struct file *filp, int on)
{
	struct tty_struct *tty = file_tty(filp);
	unsigned long flags;
	int retval = 0;

	if (tty_paranoia_check(tty, file_inode(filp), "tty_fasync"))
		goto out;

	retval = fasync_helper(fd, filp, on, &tty->fasync);
	if (retval <= 0)
		goto out;

	if (on) {
		enum pid_type type;
		struct pid *pid;

		/* Pick the owner and take a pid reference under ctrl.lock. */
		spin_lock_irqsave(&tty->ctrl.lock, flags);
		if (tty->ctrl.pgrp) {
			pid = tty->ctrl.pgrp;
			type = PIDTYPE_PGID;
		} else {
			pid = task_pid(current);
			type = PIDTYPE_TGID;
		}
		get_pid(pid);
		spin_unlock_irqrestore(&tty->ctrl.lock, flags);
		__f_setown(filp, pid, type, 0);
		put_pid(pid);
		retval = 0;
	}
out:
	return retval;
}

/*
 * fasync handler: runs __tty_fasync() under the tty lock, or returns
 * -ENOTTY if the file has been hung up.
 */
static int tty_fasync(int fd, struct file *filp, int on)
{
	struct tty_struct *tty = file_tty(filp);
	int retval = -ENOTTY;

	tty_lock(tty);
	if (!tty_hung_up_p(filp))
		retval = __tty_fasync(fd, filp, on);
	tty_unlock(tty);

	return retval;
}

/* When false, tiocsti() requires CAP_SYS_ADMIN; default follows Kconfig. */
static bool tty_legacy_tiocsti __read_mostly = IS_ENABLED(CONFIG_LEGACY_TIOCSTI);
/**
 * tiocsti - fake input character
 * @tty: tty to fake input into
 * @p: pointer to character
 *
 * Fake input to a tty device. Does the necessary locking and input management.
 *
 * FIXME: does not honour flow control ??
* * Locking: * * Called functions take tty_ldiscs_lock * * current->signal->tty check is safe without locks */ static int tiocsti(struct tty_struct *tty, u8 __user *p) { struct tty_ldisc *ld; u8 ch; if (!tty_legacy_tiocsti && !capable(CAP_SYS_ADMIN)) return -EIO; if ((current->signal->tty != tty) && !capable(CAP_SYS_ADMIN)) return -EPERM; if (get_user(ch, p)) return -EFAULT; tty_audit_tiocsti(tty, ch); ld = tty_ldisc_ref_wait(tty); if (!ld) return -EIO; tty_buffer_lock_exclusive(tty->port); if (ld->ops->receive_buf) ld->ops->receive_buf(tty, &ch, NULL, 1); tty_buffer_unlock_exclusive(tty->port); tty_ldisc_deref(ld); return 0; } /** * tiocgwinsz - implement window query ioctl * @tty: tty * @arg: user buffer for result * * Copies the kernel idea of the window size into the user buffer. * * Locking: @tty->winsize_mutex is taken to ensure the winsize data is * consistent. */ static int tiocgwinsz(struct tty_struct *tty, struct winsize __user *arg) { int err; mutex_lock(&tty->winsize_mutex); err = copy_to_user(arg, &tty->winsize, sizeof(*arg)); mutex_unlock(&tty->winsize_mutex); return err ? -EFAULT : 0; } /** * tty_do_resize - resize event * @tty: tty being resized * @ws: new dimensions * * Update the termios variables and send the necessary signals to peform a * terminal resize correctly. */ int tty_do_resize(struct tty_struct *tty, struct winsize *ws) { struct pid *pgrp; /* Lock the tty */ mutex_lock(&tty->winsize_mutex); if (!memcmp(ws, &tty->winsize, sizeof(*ws))) goto done; /* Signal the foreground process group */ pgrp = tty_get_pgrp(tty); if (pgrp) kill_pgrp(pgrp, SIGWINCH, 1); put_pid(pgrp); tty->winsize = *ws; done: mutex_unlock(&tty->winsize_mutex); return 0; } EXPORT_SYMBOL(tty_do_resize); /** * tiocswinsz - implement window size set ioctl * @tty: tty side of tty * @arg: user buffer for result * * Copies the user idea of the window size to the kernel. 
Traditionally this is * just advisory information but for the Linux console it actually has driver * level meaning and triggers a VC resize. * * Locking: * Driver dependent. The default do_resize method takes the tty termios * mutex and ctrl.lock. The console takes its own lock then calls into the * default method. */ static int tiocswinsz(struct tty_struct *tty, struct winsize __user *arg) { struct winsize tmp_ws; if (copy_from_user(&tmp_ws, arg, sizeof(*arg))) return -EFAULT; if (tty->ops->resize) return tty->ops->resize(tty, &tmp_ws); else return tty_do_resize(tty, &tmp_ws); } /** * tioccons - allow admin to move logical console * @file: the file to become console * * Allow the administrator to move the redirected console device. * * Locking: uses redirect_lock to guard the redirect information */ static int tioccons(struct file *file) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (file->f_op->write_iter == redirected_tty_write) { struct file *f; spin_lock(&redirect_lock); f = redirect; redirect = NULL; spin_unlock(&redirect_lock); if (f) fput(f); return 0; } if (file->f_op->write_iter != tty_write) return -ENOTTY; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; spin_lock(&redirect_lock); if (redirect) { spin_unlock(&redirect_lock); return -EBUSY; } redirect = get_file(file); spin_unlock(&redirect_lock); return 0; } /** * tiocsetd - set line discipline * @tty: tty device * @p: pointer to user data * * Set the line discipline according to user request. * * Locking: see tty_set_ldisc(), this function is just a helper */ static int tiocsetd(struct tty_struct *tty, int __user *p) { int disc; int ret; if (get_user(disc, p)) return -EFAULT; ret = tty_set_ldisc(tty, disc); return ret; } /** * tiocgetd - get line discipline * @tty: tty device * @p: pointer to user data * * Retrieves the line discipline id directly from the ldisc. 
* * Locking: waits for ldisc reference (in case the line discipline is changing * or the @tty is being hungup) */ static int tiocgetd(struct tty_struct *tty, int __user *p) { struct tty_ldisc *ld; int ret; ld = tty_ldisc_ref_wait(tty); if (!ld) return -EIO; ret = put_user(ld->ops->num, p); tty_ldisc_deref(ld); return ret; } /** * send_break - performed time break * @tty: device to break on * @duration: timeout in mS * * Perform a timed break on hardware that lacks its own driver level timed * break functionality. * * Locking: * @tty->atomic_write_lock serializes */ static int send_break(struct tty_struct *tty, unsigned int duration) { int retval; if (tty->ops->break_ctl == NULL) return 0; if (tty->driver->flags & TTY_DRIVER_HARDWARE_BREAK) return tty->ops->break_ctl(tty, duration); /* Do the work ourselves */ if (tty_write_lock(tty, false) < 0) return -EINTR; retval = tty->ops->break_ctl(tty, -1); if (!retval) { msleep_interruptible(duration); retval = tty->ops->break_ctl(tty, 0); } else if (retval == -EOPNOTSUPP) { /* some drivers can tell only dynamically */ retval = 0; } tty_write_unlock(tty); if (signal_pending(current)) retval = -EINTR; return retval; } /** * tty_get_tiocm - get tiocm status register * @tty: tty device * * Obtain the modem status bits from the tty driver if the feature * is supported. */ int tty_get_tiocm(struct tty_struct *tty) { int retval = -ENOTTY; if (tty->ops->tiocmget) retval = tty->ops->tiocmget(tty); return retval; } EXPORT_SYMBOL_GPL(tty_get_tiocm); /** * tty_tiocmget - get modem status * @tty: tty device * @p: pointer to result * * Obtain the modem status bits from the tty driver if the feature is * supported. Return -%ENOTTY if it is not available. 
*
 * Locking: none (up to the driver)
 */
static int tty_tiocmget(struct tty_struct *tty, int __user *p)
{
	int bits = tty_get_tiocm(tty);

	/* Negative values are errors and are passed through untouched */
	if (bits < 0)
		return bits;

	return put_user(bits, p);
}

/**
 * tty_tiocmset - set modem status
 * @tty: tty device
 * @cmd: command - clear bits, set bits or set all
 * @p: pointer to desired bits
 *
 * Set the modem status bits from the tty driver if the feature
 * is supported. Return -%ENOTTY if it is not available. Only the DTR, RTS,
 * OUT1, OUT2 and LOOP bits are forwarded to the driver.
 *
 * Locking: none (up to the driver)
 */
static int tty_tiocmset(struct tty_struct *tty, unsigned int cmd,
	     unsigned __user *p)
{
	const unsigned int settable = TIOCM_DTR|TIOCM_RTS|TIOCM_OUT1|
				      TIOCM_OUT2|TIOCM_LOOP;
	unsigned int to_set = 0;
	unsigned int to_clear = 0;
	unsigned int val;
	int rc;

	if (!tty->ops->tiocmset)
		return -ENOTTY;

	rc = get_user(val, p);
	if (rc)
		return rc;

	if (cmd == TIOCMBIS) {
		to_set = val;
	} else if (cmd == TIOCMBIC) {
		to_clear = val;
	} else if (cmd == TIOCMSET) {
		to_set = val;
		to_clear = ~val;
	}

	return tty->ops->tiocmset(tty, to_set & settable, to_clear & settable);
}

/**
 * tty_get_icount - get tty statistics
 * @tty: tty device
 * @icount: output parameter
 *
 * Gets a copy of the @tty's icount statistics.
* * Locking: none (up to the driver) */ int tty_get_icount(struct tty_struct *tty, struct serial_icounter_struct *icount) { memset(icount, 0, sizeof(*icount)); if (tty->ops->get_icount) return tty->ops->get_icount(tty, icount); else return -ENOTTY; } EXPORT_SYMBOL_GPL(tty_get_icount); static int tty_tiocgicount(struct tty_struct *tty, void __user *arg) { struct serial_icounter_struct icount; int retval; retval = tty_get_icount(tty, &icount); if (retval != 0) return retval; if (copy_to_user(arg, &icount, sizeof(icount))) return -EFAULT; return 0; } static int tty_set_serial(struct tty_struct *tty, struct serial_struct *ss) { char comm[TASK_COMM_LEN]; int flags; flags = ss->flags & ASYNC_DEPRECATED; if (flags) pr_warn_ratelimited("%s: '%s' is using deprecated serial flags (with no effect): %.8x\n", __func__, get_task_comm(comm, current), flags); if (!tty->ops->set_serial) return -ENOTTY; return tty->ops->set_serial(tty, ss); } static int tty_tiocsserial(struct tty_struct *tty, struct serial_struct __user *ss) { struct serial_struct v; if (copy_from_user(&v, ss, sizeof(*ss))) return -EFAULT; return tty_set_serial(tty, &v); } static int tty_tiocgserial(struct tty_struct *tty, struct serial_struct __user *ss) { struct serial_struct v; int err; memset(&v, 0, sizeof(v)); if (!tty->ops->get_serial) return -ENOTTY; err = tty->ops->get_serial(tty, &v); if (!err && copy_to_user(ss, &v, sizeof(v))) err = -EFAULT; return err; } /* * if pty, return the slave side (real_tty) * otherwise, return self */ static struct tty_struct *tty_pair_get_tty(struct tty_struct *tty) { if (tty->driver->type == TTY_DRIVER_TYPE_PTY && tty->driver->subtype == PTY_TYPE_MASTER) tty = tty->link; return tty; } /* * Split this up, as gcc can choke on it otherwise.. 
*/ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct tty_struct *tty = file_tty(file); struct tty_struct *real_tty; void __user *p = (void __user *)arg; int retval; struct tty_ldisc *ld; if (tty_paranoia_check(tty, file_inode(file), "tty_ioctl")) return -EINVAL; real_tty = tty_pair_get_tty(tty); /* * Factor out some common prep work */ switch (cmd) { case TIOCSETD: case TIOCSBRK: case TIOCCBRK: case TCSBRK: case TCSBRKP: retval = tty_check_change(tty); if (retval) return retval; if (cmd != TIOCCBRK) { tty_wait_until_sent(tty, 0); if (signal_pending(current)) return -EINTR; } break; } /* * Now do the stuff. */ switch (cmd) { case TIOCSTI: return tiocsti(tty, p); case TIOCGWINSZ: return tiocgwinsz(real_tty, p); case TIOCSWINSZ: return tiocswinsz(real_tty, p); case TIOCCONS: return real_tty != tty ? -EINVAL : tioccons(file); case TIOCEXCL: set_bit(TTY_EXCLUSIVE, &tty->flags); return 0; case TIOCNXCL: clear_bit(TTY_EXCLUSIVE, &tty->flags); return 0; case TIOCGEXCL: { int excl = test_bit(TTY_EXCLUSIVE, &tty->flags); return put_user(excl, (int __user *)p); } case TIOCGETD: return tiocgetd(tty, p); case TIOCSETD: return tiocsetd(tty, p); case TIOCVHANGUP: if (!capable(CAP_SYS_ADMIN)) return -EPERM; tty_vhangup(tty); return 0; case TIOCGDEV: { unsigned int ret = new_encode_dev(tty_devnum(real_tty)); return put_user(ret, (unsigned int __user *)p); } /* * Break handling */ case TIOCSBRK: /* Turn break on, unconditionally */ if (tty->ops->break_ctl) return tty->ops->break_ctl(tty, -1); return 0; case TIOCCBRK: /* Turn break off, unconditionally */ if (tty->ops->break_ctl) return tty->ops->break_ctl(tty, 0); return 0; case TCSBRK: /* SVID version: non-zero arg --> no break */ /* non-zero arg means wait for all output data * to be sent (performed above) but don't send break. * This is used by the tcdrain() termios function. 
*/ if (!arg) return send_break(tty, 250); return 0; case TCSBRKP: /* support for POSIX tcsendbreak() */ return send_break(tty, arg ? arg*100 : 250); case TIOCMGET: return tty_tiocmget(tty, p); case TIOCMSET: case TIOCMBIC: case TIOCMBIS: return tty_tiocmset(tty, cmd, p); case TIOCGICOUNT: return tty_tiocgicount(tty, p); case TCFLSH: switch (arg) { case TCIFLUSH: case TCIOFLUSH: /* flush tty buffer and allow ldisc to process ioctl */ tty_buffer_flush(tty, NULL); break; } break; case TIOCSSERIAL: return tty_tiocsserial(tty, p); case TIOCGSERIAL: return tty_tiocgserial(tty, p); case TIOCGPTPEER: /* Special because the struct file is needed */ return ptm_open_peer(file, tty, (int)arg); default: retval = tty_jobctrl_ioctl(tty, real_tty, file, cmd, arg); if (retval != -ENOIOCTLCMD) return retval; } if (tty->ops->ioctl) { retval = tty->ops->ioctl(tty, cmd, arg); if (retval != -ENOIOCTLCMD) return retval; } ld = tty_ldisc_ref_wait(tty); if (!ld) return hung_up_tty_ioctl(file, cmd, arg); retval = -EINVAL; if (ld->ops->ioctl) { retval = ld->ops->ioctl(tty, cmd, arg); if (retval == -ENOIOCTLCMD) retval = -ENOTTY; } tty_ldisc_deref(ld); return retval; } #ifdef CONFIG_COMPAT struct serial_struct32 { compat_int_t type; compat_int_t line; compat_uint_t port; compat_int_t irq; compat_int_t flags; compat_int_t xmit_fifo_size; compat_int_t custom_divisor; compat_int_t baud_base; unsigned short close_delay; char io_type; char reserved_char; compat_int_t hub6; unsigned short closing_wait; /* time to wait before closing */ unsigned short closing_wait2; /* no longer used... 
*/ compat_uint_t iomem_base; unsigned short iomem_reg_shift; unsigned int port_high; /* compat_ulong_t iomap_base FIXME */ compat_int_t reserved; }; static int compat_tty_tiocsserial(struct tty_struct *tty, struct serial_struct32 __user *ss) { struct serial_struct32 v32; struct serial_struct v; if (copy_from_user(&v32, ss, sizeof(*ss))) return -EFAULT; memcpy(&v, &v32, offsetof(struct serial_struct32, iomem_base)); v.iomem_base = compat_ptr(v32.iomem_base); v.iomem_reg_shift = v32.iomem_reg_shift; v.port_high = v32.port_high; v.iomap_base = 0; return tty_set_serial(tty, &v); } static int compat_tty_tiocgserial(struct tty_struct *tty, struct serial_struct32 __user *ss) { struct serial_struct32 v32; struct serial_struct v; int err; memset(&v, 0, sizeof(v)); memset(&v32, 0, sizeof(v32)); if (!tty->ops->get_serial) return -ENOTTY; err = tty->ops->get_serial(tty, &v); if (!err) { memcpy(&v32, &v, offsetof(struct serial_struct32, iomem_base)); v32.iomem_base = (unsigned long)v.iomem_base >> 32 ? 
0xfffffff : ptr_to_compat(v.iomem_base); v32.iomem_reg_shift = v.iomem_reg_shift; v32.port_high = v.port_high; if (copy_to_user(ss, &v32, sizeof(v32))) err = -EFAULT; } return err; } static long tty_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct tty_struct *tty = file_tty(file); struct tty_ldisc *ld; int retval = -ENOIOCTLCMD; switch (cmd) { case TIOCOUTQ: case TIOCSTI: case TIOCGWINSZ: case TIOCSWINSZ: case TIOCGEXCL: case TIOCGETD: case TIOCSETD: case TIOCGDEV: case TIOCMGET: case TIOCMSET: case TIOCMBIC: case TIOCMBIS: case TIOCGICOUNT: case TIOCGPGRP: case TIOCSPGRP: case TIOCGSID: case TIOCSERGETLSR: case TIOCGRS485: case TIOCSRS485: #ifdef TIOCGETP case TIOCGETP: case TIOCSETP: case TIOCSETN: #endif #ifdef TIOCGETC case TIOCGETC: case TIOCSETC: #endif #ifdef TIOCGLTC case TIOCGLTC: case TIOCSLTC: #endif case TCSETSF: case TCSETSW: case TCSETS: case TCGETS: #ifdef TCGETS2 case TCGETS2: case TCSETSF2: case TCSETSW2: case TCSETS2: #endif case TCGETA: case TCSETAF: case TCSETAW: case TCSETA: case TIOCGLCKTRMIOS: case TIOCSLCKTRMIOS: #ifdef TCGETX case TCGETX: case TCSETX: case TCSETXW: case TCSETXF: #endif case TIOCGSOFTCAR: case TIOCSSOFTCAR: case PPPIOCGCHAN: case PPPIOCGUNIT: return tty_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); case TIOCCONS: case TIOCEXCL: case TIOCNXCL: case TIOCVHANGUP: case TIOCSBRK: case TIOCCBRK: case TCSBRK: case TCSBRKP: case TCFLSH: case TIOCGPTPEER: case TIOCNOTTY: case TIOCSCTTY: case TCXONC: case TIOCMIWAIT: case TIOCSERCONFIG: return tty_ioctl(file, cmd, arg); } if (tty_paranoia_check(tty, file_inode(file), "tty_ioctl")) return -EINVAL; switch (cmd) { case TIOCSSERIAL: return compat_tty_tiocsserial(tty, compat_ptr(arg)); case TIOCGSERIAL: return compat_tty_tiocgserial(tty, compat_ptr(arg)); } if (tty->ops->compat_ioctl) { retval = tty->ops->compat_ioctl(tty, cmd, arg); if (retval != -ENOIOCTLCMD) return retval; } ld = tty_ldisc_ref_wait(tty); if (!ld) return hung_up_tty_compat_ioctl(file, 
cmd, arg); if (ld->ops->compat_ioctl) retval = ld->ops->compat_ioctl(tty, cmd, arg); if (retval == -ENOIOCTLCMD && ld->ops->ioctl) retval = ld->ops->ioctl(tty, (unsigned long)compat_ptr(cmd), arg); tty_ldisc_deref(ld); return retval; } #endif static int this_tty(const void *t, struct file *file, unsigned fd) { if (likely(file->f_op->read_iter != tty_read)) return 0; return file_tty(file) != t ? 0 : fd + 1; } /* * This implements the "Secure Attention Key" --- the idea is to * prevent trojan horses by killing all processes associated with this * tty when the user hits the "Secure Attention Key". Required for * super-paranoid applications --- see the Orange Book for more details. * * This code could be nicer; ideally it should send a HUP, wait a few * seconds, then send a INT, and then a KILL signal. But you then * have to coordinate with the init process, since all processes associated * with the current tty must be dead before the new getty is allowed * to spawn. * * Now, if it would be correct ;-/ The current code has a nasty hole - * it doesn't catch files in flight. We may send the descriptor to ourselves * via AF_UNIX socket, close it and later fetch from socket. FIXME. * * Nasty bug: do_SAK is being called in interrupt context. This can * deadlock. We punt it up to process context. 
AKPM - 16Mar2001
 */
void __do_SAK(struct tty_struct *tty)
{
	struct task_struct *g, *p;
	struct pid *session;
	int i;
	unsigned long flags;

	/* Take a reference on the session pid while ctrl.lock is held */
	spin_lock_irqsave(&tty->ctrl.lock, flags);
	session = get_pid(tty->ctrl.session);
	spin_unlock_irqrestore(&tty->ctrl.lock, flags);

	tty_ldisc_flush(tty);

	tty_driver_flush_buffer(tty);

	/* Both task walks below run under tasklist_lock (read side) */
	read_lock(&tasklist_lock);
	/* Kill the entire session */
	do_each_pid_task(session, PIDTYPE_SID, p) {
		tty_notice(tty, "SAK: killed process %d (%s): by session\n",
			   task_pid_nr(p), p->comm);
		group_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_SID);
	} while_each_pid_task(session, PIDTYPE_SID, p);

	/* Now kill any processes that happen to have the tty open */
	for_each_process_thread(g, p) {
		/* Tasks whose signal struct points at this tty need no fd scan */
		if (p->signal->tty == tty) {
			tty_notice(tty, "SAK: killed process %d (%s): by controlling tty\n",
				   task_pid_nr(p), p->comm);
			group_send_sig_info(SIGKILL, SEND_SIG_PRIV, p,
					PIDTYPE_SID);
			continue;
		}
		task_lock(p);
		/* this_tty() returns fd + 1 on a match, hence "i - 1" below */
		i = iterate_fd(p->files, 0, this_tty, tty);
		if (i != 0) {
			tty_notice(tty, "SAK: killed process %d (%s): by fd#%d\n",
				   task_pid_nr(p), p->comm, i - 1);
			group_send_sig_info(SIGKILL, SEND_SIG_PRIV, p,
					PIDTYPE_SID);
		}
		task_unlock(p);
	}
	read_unlock(&tasklist_lock);
	/* Drop the reference taken at the top */
	put_pid(session);
}

/* Work item handler: recover the tty from its SAK_work and run the SAK */
static void do_SAK_work(struct work_struct *work)
{
	struct tty_struct *tty =
		container_of(work, struct tty_struct, SAK_work);
	__do_SAK(tty);
}

/*
 * The tq handling here is a little racy - tty->SAK_work may already be queued.
 * Fortunately we don't need to worry, because if ->SAK_work is already queued,
 * the values which we write to it will be identical to the values which it
 * already has. --akpm
 */
void do_SAK(struct tty_struct *tty)
{
	/* A NULL tty is silently ignored */
	if (!tty)
		return;
	/* Defer the actual work to __do_SAK() via the tty's SAK_work item */
	schedule_work(&tty->SAK_work);
}
EXPORT_SYMBOL(do_SAK);

/* Must put_device() after it's unused!
*/ static struct device *tty_get_device(struct tty_struct *tty) { dev_t devt = tty_devnum(tty); return class_find_device_by_devt(&tty_class, devt); } /** * alloc_tty_struct - allocate a new tty * @driver: driver which will handle the returned tty * @idx: minor of the tty * * This subroutine allocates and initializes a tty structure. * * Locking: none - @tty in question is not exposed at this point */ struct tty_struct *alloc_tty_struct(struct tty_driver *driver, int idx) { struct tty_struct *tty; tty = kzalloc(sizeof(*tty), GFP_KERNEL_ACCOUNT); if (!tty) return NULL; kref_init(&tty->kref); if (tty_ldisc_init(tty)) { kfree(tty); return NULL; } tty->ctrl.session = NULL; tty->ctrl.pgrp = NULL; mutex_init(&tty->legacy_mutex); mutex_init(&tty->throttle_mutex); init_rwsem(&tty->termios_rwsem); mutex_init(&tty->winsize_mutex); init_ldsem(&tty->ldisc_sem); init_waitqueue_head(&tty->write_wait); init_waitqueue_head(&tty->read_wait); INIT_WORK(&tty->hangup_work, do_tty_hangup); mutex_init(&tty->atomic_write_lock); spin_lock_init(&tty->ctrl.lock); spin_lock_init(&tty->flow.lock); spin_lock_init(&tty->files_lock); INIT_LIST_HEAD(&tty->tty_files); INIT_WORK(&tty->SAK_work, do_SAK_work); tty->driver = driver; tty->ops = driver->ops; tty->index = idx; tty_line_name(driver, idx, tty->name); tty->dev = tty_get_device(tty); return tty; } /** * tty_put_char - write one character to a tty * @tty: tty * @ch: character to write * * Write one byte to the @tty using the provided @tty->ops->put_char() method * if present. * * Note: the specific put_char operation in the driver layer may go * away soon. Don't call it directly, use this method * * Return: the number of characters successfully output. 
*/ int tty_put_char(struct tty_struct *tty, u8 ch) { if (tty->ops->put_char) return tty->ops->put_char(tty, ch); return tty->ops->write(tty, &ch, 1); } EXPORT_SYMBOL_GPL(tty_put_char); static int tty_cdev_add(struct tty_driver *driver, dev_t dev, unsigned int index, unsigned int count) { int err; /* init here, since reused cdevs cause crashes */ driver->cdevs[index] = cdev_alloc(); if (!driver->cdevs[index]) return -ENOMEM; driver->cdevs[index]->ops = &tty_fops; driver->cdevs[index]->owner = driver->owner; err = cdev_add(driver->cdevs[index], dev, count); if (err) kobject_put(&driver->cdevs[index]->kobj); return err; } /** * tty_register_device - register a tty device * @driver: the tty driver that describes the tty device * @index: the index in the tty driver for this tty device * @device: a struct device that is associated with this tty device. * This field is optional, if there is no known struct device * for this tty device it can be set to NULL safely. * * This call is required to be made to register an individual tty device * if the tty driver's flags have the %TTY_DRIVER_DYNAMIC_DEV bit set. If * that bit is not set, this function should not be called by a tty * driver. * * Locking: ?? * * Return: A pointer to the struct device for this tty device (or * ERR_PTR(-EFOO) on error). */ struct device *tty_register_device(struct tty_driver *driver, unsigned index, struct device *device) { return tty_register_device_attr(driver, index, device, NULL, NULL); } EXPORT_SYMBOL(tty_register_device); static void tty_device_create_release(struct device *dev) { dev_dbg(dev, "releasing...\n"); kfree(dev); } /** * tty_register_device_attr - register a tty device * @driver: the tty driver that describes the tty device * @index: the index in the tty driver for this tty device * @device: a struct device that is associated with this tty device. * This field is optional, if there is no known struct device * for this tty device it can be set to %NULL safely. 
* @drvdata: Driver data to be set to device. * @attr_grp: Attribute group to be set on device. * * This call is required to be made to register an individual tty device if the * tty driver's flags have the %TTY_DRIVER_DYNAMIC_DEV bit set. If that bit is * not set, this function should not be called by a tty driver. * * Locking: ?? * * Return: A pointer to the struct device for this tty device (or * ERR_PTR(-EFOO) on error). */ struct device *tty_register_device_attr(struct tty_driver *driver, unsigned index, struct device *device, void *drvdata, const struct attribute_group **attr_grp) { char name[64]; dev_t devt = MKDEV(driver->major, driver->minor_start) + index; struct ktermios *tp; struct device *dev; int retval; if (index >= driver->num) { pr_err("%s: Attempt to register invalid tty line number (%d)\n", driver->name, index); return ERR_PTR(-EINVAL); } if (driver->type == TTY_DRIVER_TYPE_PTY) pty_line_name(driver, index, name); else tty_line_name(driver, index, name); dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return ERR_PTR(-ENOMEM); dev->devt = devt; dev->class = &tty_class; dev->parent = device; dev->release = tty_device_create_release; dev_set_name(dev, "%s", name); dev->groups = attr_grp; dev_set_drvdata(dev, drvdata); dev_set_uevent_suppress(dev, 1); retval = device_register(dev); if (retval) goto err_put; if (!(driver->flags & TTY_DRIVER_DYNAMIC_ALLOC)) { /* * Free any saved termios data so that the termios state is * reset when reusing a minor number. 
*/ tp = driver->termios[index]; if (tp) { driver->termios[index] = NULL; kfree(tp); } retval = tty_cdev_add(driver, devt, index, 1); if (retval) goto err_del; } dev_set_uevent_suppress(dev, 0); kobject_uevent(&dev->kobj, KOBJ_ADD); return dev; err_del: device_del(dev); err_put: put_device(dev); return ERR_PTR(retval); } EXPORT_SYMBOL_GPL(tty_register_device_attr); /** * tty_unregister_device - unregister a tty device * @driver: the tty driver that describes the tty device * @index: the index in the tty driver for this tty device * * If a tty device is registered with a call to tty_register_device() then * this function must be called when the tty device is gone. * * Locking: ?? */ void tty_unregister_device(struct tty_driver *driver, unsigned index) { device_destroy(&tty_class, MKDEV(driver->major, driver->minor_start) + index); if (!(driver->flags & TTY_DRIVER_DYNAMIC_ALLOC)) { cdev_del(driver->cdevs[index]); driver->cdevs[index] = NULL; } } EXPORT_SYMBOL(tty_unregister_device); /** * __tty_alloc_driver - allocate tty driver * @lines: count of lines this driver can handle at most * @owner: module which is responsible for this driver * @flags: some of %TTY_DRIVER_ flags, will be set in driver->flags * * This should not be called directly, some of the provided macros should be * used instead. Use IS_ERR() and friends on @retval. 
*/ struct tty_driver *__tty_alloc_driver(unsigned int lines, struct module *owner, unsigned long flags) { struct tty_driver *driver; unsigned int cdevs = 1; int err; if (!lines || (flags & TTY_DRIVER_UNNUMBERED_NODE && lines > 1)) return ERR_PTR(-EINVAL); driver = kzalloc(sizeof(*driver), GFP_KERNEL); if (!driver) return ERR_PTR(-ENOMEM); kref_init(&driver->kref); driver->num = lines; driver->owner = owner; driver->flags = flags; if (!(flags & TTY_DRIVER_DEVPTS_MEM)) { driver->ttys = kcalloc(lines, sizeof(*driver->ttys), GFP_KERNEL); driver->termios = kcalloc(lines, sizeof(*driver->termios), GFP_KERNEL); if (!driver->ttys || !driver->termios) { err = -ENOMEM; goto err_free_all; } } if (!(flags & TTY_DRIVER_DYNAMIC_ALLOC)) { driver->ports = kcalloc(lines, sizeof(*driver->ports), GFP_KERNEL); if (!driver->ports) { err = -ENOMEM; goto err_free_all; } cdevs = lines; } driver->cdevs = kcalloc(cdevs, sizeof(*driver->cdevs), GFP_KERNEL); if (!driver->cdevs) { err = -ENOMEM; goto err_free_all; } return driver; err_free_all: kfree(driver->ports); kfree(driver->ttys); kfree(driver->termios); kfree(driver->cdevs); kfree(driver); return ERR_PTR(err); } EXPORT_SYMBOL(__tty_alloc_driver); static void destruct_tty_driver(struct kref *kref) { struct tty_driver *driver = container_of(kref, struct tty_driver, kref); int i; struct ktermios *tp; if (driver->flags & TTY_DRIVER_INSTALLED) { for (i = 0; i < driver->num; i++) { tp = driver->termios[i]; if (tp) { driver->termios[i] = NULL; kfree(tp); } if (!(driver->flags & TTY_DRIVER_DYNAMIC_DEV)) tty_unregister_device(driver, i); } proc_tty_unregister_driver(driver); if (driver->flags & TTY_DRIVER_DYNAMIC_ALLOC) cdev_del(driver->cdevs[0]); } kfree(driver->cdevs); kfree(driver->ports); kfree(driver->termios); kfree(driver->ttys); kfree(driver); } /** * tty_driver_kref_put - drop a reference to a tty driver * @driver: driver of which to drop the reference * * The final put will destroy and free up the driver. 
*/
void tty_driver_kref_put(struct tty_driver *driver)
{
	/* destruct_tty_driver() runs once the count drops to zero */
	kref_put(&driver->kref, destruct_tty_driver);
}
EXPORT_SYMBOL(tty_driver_kref_put);

/**
 * tty_register_driver - register a tty driver
 * @driver: driver to register
 *
 * Called by a tty driver to register itself.
 *
 * Return: 0 on success, otherwise the error from the step that failed.
 */
int tty_register_driver(struct tty_driver *driver)
{
	int error;
	int i;
	dev_t dev;
	struct device *d;

	/* major == 0: have a char device region allocated dynamically */
	if (!driver->major) {
		error = alloc_chrdev_region(&dev, driver->minor_start,
						driver->num, driver->name);
		if (!error) {
			/* Record the numbers that were actually handed out */
			driver->major = MAJOR(dev);
			driver->minor_start = MINOR(dev);
		}
	} else {
		dev = MKDEV(driver->major, driver->minor_start);
		error = register_chrdev_region(dev, driver->num, driver->name);
	}
	if (error < 0)
		goto err;

	/* Dynamic-alloc drivers use one cdev spanning all their minors */
	if (driver->flags & TTY_DRIVER_DYNAMIC_ALLOC) {
		error = tty_cdev_add(driver, dev, 0, driver->num);
		if (error)
			goto err_unreg_char;
	}

	mutex_lock(&tty_mutex);
	list_add(&driver->tty_drivers, &tty_drivers);
	mutex_unlock(&tty_mutex);

	/* Without TTY_DRIVER_DYNAMIC_DEV every line is registered right away */
	if (!(driver->flags & TTY_DRIVER_DYNAMIC_DEV)) {
		for (i = 0; i < driver->num; i++) {
			d = tty_register_device(driver, i, NULL);
			if (IS_ERR(d)) {
				error = PTR_ERR(d);
				goto err_unreg_devs;
			}
		}
	}
	proc_tty_register_driver(driver);
	driver->flags |= TTY_DRIVER_INSTALLED;
	return 0;

err_unreg_devs:
	/* Line i failed; unwind the lines registered before it */
	for (i--; i >= 0; i--)
		tty_unregister_device(driver, i);

	mutex_lock(&tty_mutex);
	list_del(&driver->tty_drivers);
	mutex_unlock(&tty_mutex);

	/*
	 * NOTE(review): a cdev added above for TTY_DRIVER_DYNAMIC_ALLOC does
	 * not appear to be deleted on this path - confirm whether that is
	 * intended.
	 */
err_unreg_char:
	unregister_chrdev_region(dev, driver->num);
err:
	return error;
}
EXPORT_SYMBOL(tty_register_driver);

/**
 * tty_unregister_driver - unregister a tty driver
 * @driver: driver to unregister
 *
 * Called by a tty driver to unregister itself.
*/ void tty_unregister_driver(struct tty_driver *driver) { unregister_chrdev_region(MKDEV(driver->major, driver->minor_start), driver->num); mutex_lock(&tty_mutex); list_del(&driver->tty_drivers); mutex_unlock(&tty_mutex); } EXPORT_SYMBOL(tty_unregister_driver); dev_t tty_devnum(struct tty_struct *tty) { return MKDEV(tty->driver->major, tty->driver->minor_start) + tty->index; } EXPORT_SYMBOL(tty_devnum); void tty_default_fops(struct file_operations *fops) { *fops = tty_fops; } static char *tty_devnode(const struct device *dev, umode_t *mode) { if (!mode) return NULL; if (dev->devt == MKDEV(TTYAUX_MAJOR, 0) || dev->devt == MKDEV(TTYAUX_MAJOR, 2)) *mode = 0666; return NULL; } const struct class tty_class = { .name = "tty", .devnode = tty_devnode, }; static int __init tty_class_init(void) { return class_register(&tty_class); } postcore_initcall(tty_class_init); /* 3/2004 jmc: why do these devices exist? */ static struct cdev tty_cdev, console_cdev; static ssize_t show_cons_active(struct device *dev, struct device_attribute *attr, char *buf) { struct console *cs[16]; int i = 0; struct console *c; ssize_t count = 0; /* * Hold the console_list_lock to guarantee that no consoles are * unregistered until all console processing is complete. * This also allows safe traversal of the console list and * race-free reading of @flags. */ console_list_lock(); for_each_console(c) { if (!c->device) continue; if (!c->write) continue; if ((c->flags & CON_ENABLED) == 0) continue; cs[i++] = c; if (i >= ARRAY_SIZE(cs)) break; } /* * Take console_lock to serialize device() callback with * other console operations. For example, fg_console is * modified under console_lock when switching vt. 
*/ console_lock(); while (i--) { int index = cs[i]->index; struct tty_driver *drv = cs[i]->device(cs[i], &index); /* don't resolve tty0 as some programs depend on it */ if (drv && (cs[i]->index > 0 || drv->major != TTY_MAJOR)) count += tty_line_name(drv, index, buf + count); else count += sprintf(buf + count, "%s%d", cs[i]->name, cs[i]->index); count += sprintf(buf + count, "%c", i ? ' ':'\n'); } console_unlock(); console_list_unlock(); return count; } static DEVICE_ATTR(active, S_IRUGO, show_cons_active, NULL); static struct attribute *cons_dev_attrs[] = { &dev_attr_active.attr, NULL }; ATTRIBUTE_GROUPS(cons_dev); static struct device *consdev; void console_sysfs_notify(void) { if (consdev) sysfs_notify(&consdev->kobj, NULL, "active"); } static struct ctl_table tty_table[] = { { .procname = "legacy_tiocsti", .data = &tty_legacy_tiocsti, .maxlen = sizeof(tty_legacy_tiocsti), .mode = 0644, .proc_handler = proc_dobool, }, { .procname = "ldisc_autoload", .data = &tty_ldisc_autoload, .maxlen = sizeof(tty_ldisc_autoload), .mode = 0644, .proc_handler = proc_dointvec, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, }; /* * Ok, now we can initialize the rest of the tty devices and can count * on memory allocations, interrupts etc.. 
*/
int __init tty_init(void)
{
	const dev_t tty_devt = MKDEV(TTYAUX_MAJOR, 0);
	const dev_t console_devt = MKDEV(TTYAUX_MAJOR, 1);

	register_sysctl_init("dev/tty", tty_table);

	/* /dev/tty: failing to set it up is fatal */
	cdev_init(&tty_cdev, &tty_fops);
	if (cdev_add(&tty_cdev, tty_devt, 1) ||
	    register_chrdev_region(tty_devt, 1, "/dev/tty") < 0)
		panic("Couldn't register /dev/tty driver\n");
	device_create(&tty_class, NULL, tty_devt, NULL, "tty");

	/* /dev/console: equally fatal */
	cdev_init(&console_cdev, &console_fops);
	if (cdev_add(&console_cdev, console_devt, 1) ||
	    register_chrdev_region(console_devt, 1, "/dev/console") < 0)
		panic("Couldn't register /dev/console driver\n");

	/* A failed device creation only disables console_sysfs_notify() */
	consdev = device_create_with_groups(&tty_class, NULL, console_devt,
					    NULL, cons_dev_groups, "console");
	if (IS_ERR(consdev))
		consdev = NULL;

#ifdef CONFIG_VT
	vty_init(&console_fops);
#endif
	return 0;
}
12 5 13 5 4 2 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 /* SPDX-License-Identifier: GPL-2.0-only */ #undef TRACE_SYSTEM #define TRACE_SYSTEM l2tp #if !defined(_TRACE_L2TP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_L2TP_H #include <linux/tracepoint.h> #include <linux/l2tp.h> #include "l2tp_core.h" #define encap_type_name(e) { L2TP_ENCAPTYPE_##e, #e } #define show_encap_type_name(val) \ __print_symbolic(val, \ encap_type_name(UDP), \ encap_type_name(IP)) #define pw_type_name(p) { L2TP_PWTYPE_##p, #p } #define show_pw_type_name(val) \ __print_symbolic(val, \ pw_type_name(ETH_VLAN), \ pw_type_name(ETH), \ pw_type_name(PPP), \ pw_type_name(PPP_AC), \ pw_type_name(IP)) DECLARE_EVENT_CLASS(tunnel_only_evt, TP_PROTO(struct l2tp_tunnel *tunnel), TP_ARGS(tunnel), TP_STRUCT__entry( __array(char, name, L2TP_TUNNEL_NAME_MAX) ), TP_fast_assign( memcpy(__entry->name, tunnel->name, L2TP_TUNNEL_NAME_MAX); ), TP_printk("%s", __entry->name) ); DECLARE_EVENT_CLASS(session_only_evt, TP_PROTO(struct l2tp_session *session), TP_ARGS(session), TP_STRUCT__entry( __array(char, name, L2TP_SESSION_NAME_MAX) ), TP_fast_assign( memcpy(__entry->name, session->name, L2TP_SESSION_NAME_MAX); ), TP_printk("%s", __entry->name) ); TRACE_EVENT(register_tunnel, TP_PROTO(struct l2tp_tunnel *tunnel), 
TP_ARGS(tunnel), TP_STRUCT__entry( __array(char, name, L2TP_TUNNEL_NAME_MAX) __field(int, fd) __field(u32, tid) __field(u32, ptid) __field(int, version) __field(enum l2tp_encap_type, encap) ), TP_fast_assign( memcpy(__entry->name, tunnel->name, L2TP_TUNNEL_NAME_MAX); __entry->fd = tunnel->fd; __entry->tid = tunnel->tunnel_id; __entry->ptid = tunnel->peer_tunnel_id; __entry->version = tunnel->version; __entry->encap = tunnel->encap; ), TP_printk("%s: type=%s encap=%s version=L2TPv%d tid=%u ptid=%u fd=%d", __entry->name, __entry->fd > 0 ? "managed" : "unmanaged", show_encap_type_name(__entry->encap), __entry->version, __entry->tid, __entry->ptid, __entry->fd) ); DEFINE_EVENT(tunnel_only_evt, delete_tunnel, TP_PROTO(struct l2tp_tunnel *tunnel), TP_ARGS(tunnel) ); DEFINE_EVENT(tunnel_only_evt, free_tunnel, TP_PROTO(struct l2tp_tunnel *tunnel), TP_ARGS(tunnel) ); TRACE_EVENT(register_session, TP_PROTO(struct l2tp_session *session), TP_ARGS(session), TP_STRUCT__entry( __array(char, name, L2TP_SESSION_NAME_MAX) __field(u32, tid) __field(u32, ptid) __field(u32, sid) __field(u32, psid) __field(enum l2tp_pwtype, pwtype) ), TP_fast_assign( memcpy(__entry->name, session->name, L2TP_SESSION_NAME_MAX); __entry->tid = session->tunnel ? session->tunnel->tunnel_id : 0; __entry->ptid = session->tunnel ? 
session->tunnel->peer_tunnel_id : 0; __entry->sid = session->session_id; __entry->psid = session->peer_session_id; __entry->pwtype = session->pwtype; ), TP_printk("%s: pseudowire=%s sid=%u psid=%u tid=%u ptid=%u", __entry->name, show_pw_type_name(__entry->pwtype), __entry->sid, __entry->psid, __entry->sid, __entry->psid) ); DEFINE_EVENT(session_only_evt, delete_session, TP_PROTO(struct l2tp_session *session), TP_ARGS(session) ); DEFINE_EVENT(session_only_evt, free_session, TP_PROTO(struct l2tp_session *session), TP_ARGS(session) ); DEFINE_EVENT(session_only_evt, session_seqnum_lns_enable, TP_PROTO(struct l2tp_session *session), TP_ARGS(session) ); DEFINE_EVENT(session_only_evt, session_seqnum_lns_disable, TP_PROTO(struct l2tp_session *session), TP_ARGS(session) ); DECLARE_EVENT_CLASS(session_seqnum_evt, TP_PROTO(struct l2tp_session *session), TP_ARGS(session), TP_STRUCT__entry( __array(char, name, L2TP_SESSION_NAME_MAX) __field(u32, ns) __field(u32, nr) ), TP_fast_assign( memcpy(__entry->name, session->name, L2TP_SESSION_NAME_MAX); __entry->ns = session->ns; __entry->nr = session->nr; ), TP_printk("%s: ns=%u nr=%u", __entry->name, __entry->ns, __entry->nr) ); DEFINE_EVENT(session_seqnum_evt, session_seqnum_update, TP_PROTO(struct l2tp_session *session), TP_ARGS(session) ); DEFINE_EVENT(session_seqnum_evt, session_seqnum_reset, TP_PROTO(struct l2tp_session *session), TP_ARGS(session) ); DECLARE_EVENT_CLASS(session_pkt_discard_evt, TP_PROTO(struct l2tp_session *session, u32 pkt_ns), TP_ARGS(session, pkt_ns), TP_STRUCT__entry( __array(char, name, L2TP_SESSION_NAME_MAX) __field(u32, pkt_ns) __field(u32, my_nr) __field(u32, reorder_q_len) ), TP_fast_assign( memcpy(__entry->name, session->name, L2TP_SESSION_NAME_MAX); __entry->pkt_ns = pkt_ns, __entry->my_nr = session->nr; __entry->reorder_q_len = skb_queue_len(&session->reorder_q); ), TP_printk("%s: pkt_ns=%u my_nr=%u reorder_q_len=%u", __entry->name, __entry->pkt_ns, __entry->my_nr, __entry->reorder_q_len) ); 
DEFINE_EVENT(session_pkt_discard_evt, session_pkt_expired, TP_PROTO(struct l2tp_session *session, u32 pkt_ns), TP_ARGS(session, pkt_ns) ); DEFINE_EVENT(session_pkt_discard_evt, session_pkt_outside_rx_window, TP_PROTO(struct l2tp_session *session, u32 pkt_ns), TP_ARGS(session, pkt_ns) ); DEFINE_EVENT(session_pkt_discard_evt, session_pkt_oos, TP_PROTO(struct l2tp_session *session, u32 pkt_ns), TP_ARGS(session, pkt_ns) ); #endif /* _TRACE_L2TP_H */ /* This part must be outside protection */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE trace #include <trace/define_trace.h>
80 81 82 80 80 82 52 52 11 72 75 73 73 72 70 83 82 43 42 45 390 382 82 86 81 83 81 84 86 86 86 86 6 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 
502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 // SPDX-License-Identifier: GPL-2.0-or-later #include <net/gro.h> #include <net/dst_metadata.h> #include <net/busy_poll.h> #include <trace/events/net.h> #define MAX_GRO_SKBS 8 /* This should be increased if a protocol with a bigger head is added. */ #define GRO_MAX_HEAD (MAX_HEADER + 128) static DEFINE_SPINLOCK(offload_lock); /** * dev_add_offload - register offload handlers * @po: protocol offload declaration * * Add protocol offload handlers to the networking stack. The passed * &proto_offload is linked into kernel lists and may not be freed until * it has been removed from the kernel lists. * * This call does not sleep therefore it can not * guarantee all CPU's that are in middle of receiving packets * will see the new offload handlers (until the next received packet). 
*/
void dev_add_offload(struct packet_offload *po)
{
	struct packet_offload *elem;

	spin_lock(&offload_lock);
	/* Keep the list sorted by priority: stop at the first entry whose
	 * priority is greater than the new one and insert just before it.
	 */
	list_for_each_entry(elem, &net_hotdata.offload_base, list) {
		if (po->priority < elem->priority)
			break;
	}
	list_add_rcu(&po->list, elem->list.prev);
	spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(dev_add_offload);

/**
 *	__dev_remove_offload	 - remove offload handler
 *	@po: packet offload declaration
 *
 *	Remove a protocol offload handler that was previously added to the
 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
 *	is removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPU's have gone
 *	through a quiescent state.
 */
static void __dev_remove_offload(struct packet_offload *po)
{
	struct list_head *head = &net_hotdata.offload_base;
	struct packet_offload *po1;

	spin_lock(&offload_lock);

	/* Only unlink @po if it is really on the list. */
	list_for_each_entry(po1, head, list) {
		if (po == po1) {
			list_del_rcu(&po->list);
			goto out;
		}
	}

	pr_warn("dev_remove_offload: %p not found\n", po);
out:
	spin_unlock(&offload_lock);
}

/**
 *	dev_remove_offload	 - remove packet offload handler
 *	@po: packet offload declaration
 *
 *	Remove a packet offload handler that was previously added to the kernel
 *	offload handlers by dev_add_offload(). The passed &offload_type is
 *	removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_offload(struct packet_offload *po)
{
	__dev_remove_offload(po);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);

/*
 * skb_gro_receive - merge @skb into the held GRO packet @p
 *
 * Returns 0 on success (and marks @skb as same_flow), -ETOOMANYREFS when the
 * pp_recycle flags of the two skbs differ, or -E2BIG when the merged length
 * would reach the device limit, @skb is flagged for flush, or the legacy size
 * limit is reached for a packet that fails the checks below.
 *
 * Three merge strategies are tried in order:
 *  - @skb has no linear payload past the GRO offset: its frags are moved into
 *    the frag array of the last skb of @p;
 *  - @skb->head_frag is set: the head page is added as a frag, followed by
 *    @skb's own frags;
 *  - otherwise (or when the frag array would overflow): @skb is chained onto
 *    @p's frag_list.
 */
int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
{
	struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb);
	unsigned int offset = skb_gro_offset(skb);
	unsigned int headlen = skb_headlen(skb);
	unsigned int len = skb_gro_len(skb);
	unsigned int delta_truesize;
	unsigned int gro_max_size;
	unsigned int new_truesize;
	struct sk_buff *lp;
	int segs;

	/* Do not splice page pool based packets w/ non-page pool
	 * packets. This can result in reference count issues as page
	 * pool pages will not decrement the reference count and will
	 * instead be immediately returned to the pool or have frag
	 * count decremented.
	 */
	if (p->pp_recycle != skb->pp_recycle)
		return -ETOOMANYREFS;

	/* pairs with WRITE_ONCE() in netif_set_gro(_ipv4)_max_size() */
	gro_max_size = p->protocol == htons(ETH_P_IPV6) ?
			READ_ONCE(p->dev->gro_max_size) :
			READ_ONCE(p->dev->gro_ipv4_max_size);

	if (unlikely(p->len + len >= gro_max_size || NAPI_GRO_CB(skb)->flush))
		return -E2BIG;

	/* Past the legacy limit only TCP is accepted, without encapsulation,
	 * and for IPv6 only if @p has headroom for a hop_jumbo_hdr.
	 */
	if (unlikely(p->len + len >= GRO_LEGACY_MAX_SIZE)) {
		if (NAPI_GRO_CB(skb)->proto != IPPROTO_TCP ||
		    (p->protocol == htons(ETH_P_IPV6) &&
		     skb_headroom(p) < sizeof(struct hop_jumbo_hdr)) ||
		    p->encapsulation)
			return -E2BIG;
	}

	segs = NAPI_GRO_CB(skb)->count;
	lp = NAPI_GRO_CB(p)->last;
	pinfo = skb_shinfo(lp);

	if (headlen <= offset) {
		skb_frag_t *frag;
		skb_frag_t *frag2;
		int i = skbinfo->nr_frags;
		int nr_frags = pinfo->nr_frags + i;

		if (nr_frags > MAX_SKB_FRAGS)
			goto merge;

		offset -= headlen;
		pinfo->nr_frags = nr_frags;
		skbinfo->nr_frags = 0;

		/* Copy skb's frags to the tail of lp's array, back to front;
		 * on exit @frag points at the first copied frag.
		 */
		frag = pinfo->frags + nr_frags;
		frag2 = skbinfo->frags + i;
		do {
			*--frag = *--frag2;
		} while (--i);

		/* Skip the part of the first frag that lies before the GRO offset. */
		skb_frag_off_add(frag, offset);
		skb_frag_size_sub(frag, offset);

		/* all fragments truesize : remove (head size + sk_buff) */
		new_truesize = SKB_TRUESIZE(skb_end_offset(skb));
		delta_truesize = skb->truesize - new_truesize;

		skb->truesize = new_truesize;
		skb->len -= skb->data_len;
		skb->data_len = 0;

		NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE;
		goto done;
	} else if (skb->head_frag) {
		int nr_frags = pinfo->nr_frags;
		skb_frag_t *frag = pinfo->frags + nr_frags;
		struct page *page = virt_to_head_page(skb->head);
		unsigned int first_size = headlen - offset;
		unsigned int first_offset;

		if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS)
			goto merge;

		/* Offset of the payload within the head page. */
		first_offset = skb->data -
			       (unsigned char *)page_address(page) +
			       offset;

		pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags;

		skb_frag_fill_page_desc(frag, page, first_offset, first_size);

		memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags);
		/* We don't need to clear skbinfo->nr_frags here */

		new_truesize = SKB_DATA_ALIGN(sizeof(struct sk_buff));
		delta_truesize = skb->truesize - new_truesize;
		skb->truesize = new_truesize;
		NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;
		goto done;
	}

merge:
	/* sk ownership - if any - completely transferred to the aggregated packet */
	skb->destructor = NULL;
	delta_truesize = skb->truesize;
	if (offset > headlen) {
		unsigned int eat = offset - headlen;

		skb_frag_off_add(&skbinfo->frags[0], eat);
		skb_frag_size_sub(&skbinfo->frags[0], eat);
		skb->data_len -= eat;
		skb->len -= eat;
		offset = headlen;
	}

	__skb_pull(skb, offset);

	/* Append skb to p's chain: first element goes on frag_list, later
	 * ones are linked through ->next of the previous last element.
	 */
	if (NAPI_GRO_CB(p)->last == p)
		skb_shinfo(p)->frag_list = skb;
	else
		NAPI_GRO_CB(p)->last->next = skb;
	NAPI_GRO_CB(p)->last = skb;
	__skb_header_release(skb);
	lp = p;

done:
	NAPI_GRO_CB(p)->count += segs;
	p->data_len += len;
	p->truesize += delta_truesize;
	p->len += len;
	/* When frags were added to a chained skb, account for them there too. */
	if (lp != p) {
		lp->data_len += len;
		lp->truesize += delta_truesize;
		lp->len += len;
	}
	NAPI_GRO_CB(skb)->same_flow = 1;
	return 0;
}

/*
 * Finish a held GRO packet and pass it on via gro_normal_one().
 *
 * A packet with a count of 1 skips the protocol callback and only has its
 * gso_size cleared. Otherwise the first offload whose type matches
 * skb->protocol and that provides gro_complete is called; if that returns an
 * error, or no such offload exists (err stays -ENOENT), the skb is freed.
 */
static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
{
	struct list_head *head = &net_hotdata.offload_base;
	struct packet_offload *ptype;
	__be16 type = skb->protocol;
	int err = -ENOENT;

	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));

	if (NAPI_GRO_CB(skb)->count == 1) {
		skb_shinfo(skb)->gso_size = 0;
		goto out;
	}

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, head, list) {
		if (ptype->type != type || !ptype->callbacks.gro_complete)
			continue;

		err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete,
					 ipv6_gro_complete, inet_gro_complete,
					 skb, 0);
		break;
	}
	rcu_read_unlock();

	if (err) {
		/* Warn if the loop ran off the end without finding a handler. */
		WARN_ON(&ptype->list == head);
		kfree_skb(skb);
		return;
	}

out:
	gro_normal_one(napi, skb, NAPI_GRO_CB(skb)->count);
}

/*
 * Complete the skbs held in one hash bucket, walking the list in reverse.
 * With @flush_old set, the walk stops at the first skb whose age equals the
 * current jiffies. The bucket's bit in gro_bitmask is cleared once its count
 * reaches zero.
 */
static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
				   bool flush_old)
{
	struct list_head *head = &napi->gro_hash[index].list;
	struct sk_buff *skb, *p;

	list_for_each_entry_safe_reverse(skb, p, head, list) {
		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
			return;
		skb_list_del_init(skb);
		napi_gro_complete(napi, skb);
		napi->gro_hash[index].count--;
	}

	if (!napi->gro_hash[index].count)
		__clear_bit(index, &napi->gro_bitmask);
}

/* napi->gro_hash[].list contains packets ordered by age.
 * youngest packets at the head of it.
 * Complete skbs in reverse order to reduce latencies.
 */
void napi_gro_flush(struct napi_struct *napi, bool flush_old)
{
	unsigned long bitmask = napi->gro_bitmask;
	unsigned int i, base = ~0U;

	/* Visit each set bit of the snapshot; base starts at ~0U so that
	 * adding the 1-based ffs() result yields a 0-based bucket index.
	 */
	while ((i = ffs(bitmask)) != 0) {
		bitmask >>= i;
		base += i;
		__napi_gro_flush_chain(napi, base, flush_old);
	}
}
EXPORT_SYMBOL(napi_gro_flush);

/*
 * Fold the TC skb extension into @diffs: the result is non-zero if exactly
 * one of the two skbs carries the extension, or if both do and their chain
 * values differ. Without CONFIG_NET_TC_SKB_EXT, @diffs is returned unchanged.
 */
static unsigned long gro_list_prepare_tc_ext(const struct sk_buff *skb,
					     const struct sk_buff *p,
					     unsigned long diffs)
{
#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
	struct tc_skb_ext *skb_ext;
	struct tc_skb_ext *p_ext;

	skb_ext = skb_ext_find(skb, TC_SKB_EXT);
	p_ext = skb_ext_find(p, TC_SKB_EXT);

	diffs |= (!!p_ext) ^ (!!skb_ext);
	if (!diffs && unlikely(skb_ext))
		diffs |= p_ext->chain ^ skb_ext->chain;
#endif
	return diffs;
}

/*
 * For every skb held on @head, clear its flush flag and set same_flow
 * according to whether it matches @skb on hash, device, VLAN, metadata and
 * MAC header (plus socket, dst metadata, conntrack and TC extension when
 * either skb has slow_gro set).
 */
static void gro_list_prepare(const struct list_head *head,
			     const struct sk_buff *skb)
{
	unsigned int maclen = skb->dev->hard_header_len;
	u32 hash = skb_get_hash_raw(skb);
	struct sk_buff *p;

	list_for_each_entry(p, head, list) {
		unsigned long diffs;

		NAPI_GRO_CB(p)->flush = 0;

		if (hash != skb_get_hash_raw(p)) {
			NAPI_GRO_CB(p)->same_flow = 0;
			continue;
		}

		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
		diffs |= p->vlan_all ^ skb->vlan_all;
		diffs |= skb_metadata_differs(p, skb);
		if (maclen == ETH_HLEN)
			diffs |= compare_ether_header(skb_mac_header(p),
						      skb_mac_header(skb));
		else if (!diffs)
			diffs = memcmp(skb_mac_header(p),
				       skb_mac_header(skb),
				       maclen);

		/* in most common scenarios 'slow_gro' is 0
		 * otherwise we are already on some slower paths
		 * either skip all the infrequent tests altogether or
		 * avoid trying too hard to skip each of them individually
		 */
		if (!diffs && unlikely(skb->slow_gro | p->slow_gro)) {
			diffs |= p->sk != skb->sk;
			diffs |= skb_metadata_dst_cmp(p, skb);
			diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb);

			diffs |= gro_list_prepare_tc_ext(skb, p, diffs);
		}

		NAPI_GRO_CB(p)->same_flow = !diffs;
	}
}

/*
 * Reset the GRO data offset and set up frag0/frag0_len. They describe the
 * linear area when the skb has one; otherwise they describe the first page
 * frag, provided it is not in highmem and satisfies the alignment test
 * against @nhoff.
 */
static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff)
{
	const struct skb_shared_info *pinfo;
	const skb_frag_t *frag0;
	unsigned int headlen;

	NAPI_GRO_CB(skb)->data_offset = 0;
	headlen = skb_headlen(skb);
	NAPI_GRO_CB(skb)->frag0 = skb->data;
	NAPI_GRO_CB(skb)->frag0_len = headlen;
	if (headlen)
		return;

	pinfo = skb_shinfo(skb);
	frag0 = &pinfo->frags[0];

	if (pinfo->nr_frags && !PageHighMem(skb_frag_page(frag0)) &&
	    (!NET_IP_ALIGN || !((skb_frag_off(frag0) + nhoff) & 3))) {
		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
		/* Never expose more than fits in the remaining tailroom. */
		NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
						    skb_frag_size(frag0),
						    skb->end - skb->tail);
	}
}

/*
 * Copy @grow bytes from frag0 into the linear area at the tail and shrink
 * the first page frag accordingly; the frag is released and removed from the
 * array if it becomes empty.
 */
static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
{
	struct skb_shared_info *pinfo = skb_shinfo(skb);

	BUG_ON(skb->end - skb->tail < grow);

	memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);

	skb->data_len -= grow;
	skb->tail += grow;

	skb_frag_off_add(&pinfo->frags[0], grow);
	skb_frag_size_sub(&pinfo->frags[0], grow);

	if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
		skb_frag_unref(skb, 0);
		memmove(pinfo->frags, pinfo->frags + 1,
			--pinfo->nr_frags * sizeof(pinfo->frags[0]));
	}
}

/* Pull from frag0 only when the GRO offset lies beyond the linear area. */
static void gro_try_pull_from_frag0(struct sk_buff *skb)
{
	int grow = skb_gro_offset(skb) - skb_headlen(skb);

	if (grow > 0)
		gro_pull_from_frag0(skb, grow);
}

/* Unlink and complete the last (oldest) skb on @head. */
static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)
{
	struct sk_buff *oldest;

	oldest = list_last_entry(head, struct sk_buff, list);

	/* We are called with head length >= MAX_GRO_SKBS, so this is
	 * impossible.
	 */
	if (WARN_ON_ONCE(!oldest))
		return;

	/* Do not adjust napi->gro_hash[].count, caller is adding a new
	 * SKB to the chain.
	 */
	skb_list_del_init(oldest);
	napi_gro_complete(napi, oldest);
}

/*
 * Core GRO receive step for one skb.
 *
 * Selects the hash bucket from the skb's raw hash, marks held skbs of the
 * same flow, and calls the gro_receive callback of the first offload
 * matching skb->protocol. Depending on the outcome the skb is reported as
 * consumed, merged into a held packet, added to the bucket as a new held
 * packet (GRO_HELD), or returned as GRO_NORMAL. The bucket's bit in
 * gro_bitmask is kept in sync with its count before returning.
 */
static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
	u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
	struct gro_list *gro_list = &napi->gro_hash[bucket];
	struct list_head *head = &net_hotdata.offload_base;
	struct packet_offload *ptype;
	__be16 type = skb->protocol;
	struct sk_buff *pp = NULL;
	enum gro_result ret;
	int same_flow;

	if (netif_elide_gro(skb->dev))
		goto normal;

	gro_list_prepare(&gro_list->list, skb);

	/* On a match we jump out with the RCU read lock still held; it is
	 * dropped after the gro_receive callback below.
	 */
	rcu_read_lock();
	list_for_each_entry_rcu(ptype, head, list) {
		if (ptype->type == type && ptype->callbacks.gro_receive)
			goto found_ptype;
	}
	rcu_read_unlock();
	goto normal;

found_ptype:
	skb_set_network_header(skb, skb_gro_offset(skb));
	skb_reset_mac_len(skb);
	BUILD_BUG_ON(sizeof_field(struct napi_gro_cb, zeroed) != sizeof(u32));
	BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed),
					sizeof(u32))); /* Avoid slow unaligned acc */
	*(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0;
	NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb);
	NAPI_GRO_CB(skb)->is_atomic = 1;
	NAPI_GRO_CB(skb)->count = 1;
	if (unlikely(skb_is_gso(skb))) {
		NAPI_GRO_CB(skb)->count = skb_shinfo(skb)->gso_segs;
		/* Only support TCP and non DODGY users. */
		if (!skb_is_gso_tcp(skb) ||
		    (skb_shinfo(skb)->gso_type & SKB_GSO_DODGY))
			NAPI_GRO_CB(skb)->flush = 1;
	}

	/* Setup for GRO checksum validation */
	switch (skb->ip_summed) {
	case CHECKSUM_COMPLETE:
		NAPI_GRO_CB(skb)->csum = skb->csum;
		NAPI_GRO_CB(skb)->csum_valid = 1;
		break;
	case CHECKSUM_UNNECESSARY:
		NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
		break;
	}

	pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
				ipv6_gro_receive, inet_gro_receive,
				&gro_list->list, skb);

	rcu_read_unlock();

	if (PTR_ERR(pp) == -EINPROGRESS) {
		ret = GRO_CONSUMED;
		goto ok;
	}

	same_flow = NAPI_GRO_CB(skb)->same_flow;
	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;

	/* A non-NULL pp is a held packet the callback wants completed now. */
	if (pp) {
		skb_list_del_init(pp);
		napi_gro_complete(napi, pp);
		gro_list->count--;
	}

	if (same_flow)
		goto ok;

	if (NAPI_GRO_CB(skb)->flush)
		goto normal;

	/* Hold the skb as a new flow; a full bucket evicts its oldest entry
	 * instead of growing, so the count is left unchanged in that case.
	 */
	if (unlikely(gro_list->count >= MAX_GRO_SKBS))
		gro_flush_oldest(napi, &gro_list->list);
	else
		gro_list->count++;

	/* Must be called before setting NAPI_GRO_CB(skb)->{age|last} */
	gro_try_pull_from_frag0(skb);
	NAPI_GRO_CB(skb)->age = jiffies;
	NAPI_GRO_CB(skb)->last = skb;
	if (!skb_is_gso(skb))
		skb_shinfo(skb)->gso_size = skb_gro_len(skb);
	list_add(&skb->list, &gro_list->list);
	ret = GRO_HELD;
ok:
	if (gro_list->count) {
		if (!test_bit(bucket, &napi->gro_bitmask))
			__set_bit(bucket, &napi->gro_bitmask);
	} else if (test_bit(bucket, &napi->gro_bitmask)) {
		__clear_bit(bucket, &napi->gro_bitmask);
	}

	return ret;

normal:
	ret = GRO_NORMAL;
	gro_try_pull_from_frag0(skb);
	goto ok;
}

/*
 * Return the first registered offload of @type that provides a gro_receive
 * callback, or NULL if there is none.
 */
struct packet_offload *gro_find_receive_by_type(__be16 type)
{
	struct list_head *offload_head = &net_hotdata.offload_base;
	struct packet_offload *ptype;

	list_for_each_entry_rcu(ptype, offload_head, list) {
		if (ptype->type != type || !ptype->callbacks.gro_receive)
			continue;
		return ptype;
	}
	return NULL;
}
EXPORT_SYMBOL(gro_find_receive_by_type);

/*
 * Return the first registered offload of @type that provides a gro_complete
 * callback, or NULL if there is none.
 */
struct packet_offload *gro_find_complete_by_type(__be16 type)
{
	struct list_head *offload_head = &net_hotdata.offload_base;
	struct packet_offload *ptype;

	list_for_each_entry_rcu(ptype, offload_head, list) {
		if (ptype->type != type || !ptype->callbacks.gro_complete)
			continue;
		return ptype;
	}
	return NULL;
}
EXPORT_SYMBOL(gro_find_complete_by_type);

/*
 * Act on the result of dev_gro_receive() for napi_gro_receive(): pass the
 * skb on for GRO_NORMAL, free it for GRO_MERGED_FREE (using the variant that
 * matches how it was merged and allocated), and do nothing otherwise.
 * Returns @ret unchanged.
 */
static gro_result_t napi_skb_finish(struct napi_struct *napi,
				    struct sk_buff *skb,
				    gro_result_t ret)
{
	switch (ret) {
	case GRO_NORMAL:
		gro_normal_one(napi, skb, 1);
		break;

	case GRO_MERGED_FREE:
		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
			napi_skb_free_stolen_head(skb);
		else if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
			__kfree_skb(skb);
		else
			__napi_kfree_skb(skb, SKB_CONSUMED);
		break;

	case GRO_HELD:
	case GRO_MERGED:
	case GRO_CONSUMED:
		break;
	}

	return ret;
}

/*
 * GRO entry point for a fully built skb: tag it with the NAPI id, reset the
 * GRO offsets, run dev_gro_receive() and finish according to its result.
 */
gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
	gro_result_t ret;

	skb_mark_napi_id(skb, napi);
	trace_napi_gro_receive_entry(skb);

	skb_gro_reset_offset(skb, 0);

	ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb));
	trace_napi_gro_receive_exit(ret);

	return ret;
}
EXPORT_SYMBOL(napi_gro_receive);

/*
 * Reset @skb to a pristine state and park it in napi->skb so the next
 * napi_get_frags() call can hand it out again. pfmemalloc skbs are released
 * instead of being reused.
 */
static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
{
	if (unlikely(skb->pfmemalloc)) {
		consume_skb(skb);
		return;
	}
	__skb_pull(skb, skb_headlen(skb));
	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
	__vlan_hwaccel_clear_tag(skb);
	skb->dev = napi->dev;
	skb->skb_iif = 0;

	/* eth_type_trans() assumes pkt_type is PACKET_HOST */
	skb->pkt_type = PACKET_HOST;

	skb->encapsulation = 0;
	skb_shinfo(skb)->gso_type = 0;
	skb_shinfo(skb)->gso_size = 0;
	if (unlikely(skb->slow_gro)) {
		skb_orphan(skb);
		skb_ext_reset(skb);
		nf_reset_ct(skb);
		skb->slow_gro = 0;
	}

	napi->skb = skb;
}

/*
 * Return the skb cached in napi->skb, allocating (and caching) a new one
 * with GRO_MAX_HEAD bytes when none is cached. Returns NULL if the
 * allocation fails.
 */
struct sk_buff *napi_get_frags(struct napi_struct *napi)
{
	struct sk_buff *skb = napi->skb;

	if (!skb) {
		skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
		if (skb) {
			napi->skb = skb;
			skb_mark_napi_id(skb, napi);
		}
	}
	return skb;
}
EXPORT_SYMBOL(napi_get_frags);

/*
 * Act on the result of dev_gro_receive() for napi_gro_frags(). For
 * GRO_NORMAL and GRO_HELD the Ethernet header pulled by napi_frags_skb() is
 * pushed back and skb->protocol is set via eth_type_trans(); a merged skb is
 * either freed (stolen head) or recycled through napi_reuse_skb().
 * Returns @ret unchanged.
 */
static gro_result_t napi_frags_finish(struct napi_struct *napi,
				      struct sk_buff *skb,
				      gro_result_t ret)
{
	switch (ret) {
	case GRO_NORMAL:
	case GRO_HELD:
		__skb_push(skb, ETH_HLEN);
		skb->protocol = eth_type_trans(skb, skb->dev);
		if (ret == GRO_NORMAL)
			gro_normal_one(napi, skb, 1);
		break;

	case GRO_MERGED_FREE:
		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
			napi_skb_free_stolen_head(skb);
		else
			napi_reuse_skb(napi, skb);
		break;

	case GRO_MERGED:
	case GRO_CONSUMED:
		break;
	}

	return ret;
}

/* Upper GRO stack assumes network header starts at gro_offset=0
 * Drivers could call both napi_gro_frags() and napi_gro_receive()
 * We copy ethernet header into skb->data to have a common layout.
 */
static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
{
	struct sk_buff *skb = napi->skb;
	const struct ethhdr *eth;
	unsigned int hlen = sizeof(*eth);

	/* Take ownership of the cached skb. */
	napi->skb = NULL;

	skb_reset_mac_header(skb);
	skb_gro_reset_offset(skb, hlen);

	if (unlikely(!skb_gro_may_pull(skb, hlen))) {
		eth = skb_gro_header_slow(skb, hlen, 0);
		if (unlikely(!eth)) {
			net_warn_ratelimited("%s: dropping impossible skb from %s\n",
					     __func__, napi->dev->name);
			napi_reuse_skb(napi, skb);
			return NULL;
		}
	} else {
		eth = (const struct ethhdr *)skb->data;

		if (NAPI_GRO_CB(skb)->frag0 != skb->data)
			gro_pull_from_frag0(skb, hlen);

		NAPI_GRO_CB(skb)->frag0 += hlen;
		NAPI_GRO_CB(skb)->frag0_len -= hlen;
	}
	__skb_pull(skb, hlen);

	/*
	 * This works because the only protocols we care about don't require
	 * special handling.
	 * We'll fix it up properly in napi_frags_finish()
	 */
	skb->protocol = eth->h_proto;

	return skb;
}

/*
 * GRO entry point for drivers that fill the skb obtained from
 * napi_get_frags(): prepare it with napi_frags_skb(), run dev_gro_receive()
 * and finish according to its result.
 *
 * NOTE(review): napi_frags_skb() has a NULL return path, and its result is
 * passed on here without a check - confirm that path is unreachable.
 */
gro_result_t napi_gro_frags(struct napi_struct *napi)
{
	gro_result_t ret;
	struct sk_buff *skb = napi_frags_skb(napi);

	trace_napi_gro_frags_entry(skb);

	ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
	trace_napi_gro_frags_exit(ret);

	return ret;
}
EXPORT_SYMBOL(napi_gro_frags);

/* Compute the checksum from gro_offset and return the folded value
 * after adding in any pseudo checksum.
 */
__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
{
	__wsum wsum;
	__sum16 sum;

	wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);

	/* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
	sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
	/* See comments in __skb_checksum_complete(). */
	if (likely(!sum)) {
		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
		    !skb->csum_complete_sw)
			netdev_rx_csum_fault(skb->dev, skb);
	}

	/* Cache the computed sum and mark it valid for later GRO layers. */
	NAPI_GRO_CB(skb)->csum = wsum;
	NAPI_GRO_CB(skb)->csum_valid = 1;

	return sum;
}
EXPORT_SYMBOL(__skb_gro_checksum_complete);
1156 1149 1076 187 184 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 // SPDX-License-Identifier: GPL-2.0 #include <linux/export.h> #include <linux/spinlock.h> #include <linux/atomic.h> /* * This is an implementation of the notion of "decrement a * reference count, and return locked if it decremented to zero". * * NOTE NOTE NOTE! This is _not_ equivalent to * * if (atomic_dec_and_test(&atomic)) { * spin_lock(&lock); * return 1; * } * return 0; * * because the spin-lock and the decrement must be * "atomic". */ int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) { /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ if (atomic_add_unless(atomic, -1, 1)) return 0; /* Otherwise do it the slow way */ spin_lock(lock); if (atomic_dec_and_test(atomic)) return 1; spin_unlock(lock); return 0; } EXPORT_SYMBOL(_atomic_dec_and_lock); int _atomic_dec_and_lock_irqsave(atomic_t *atomic, spinlock_t *lock, unsigned long *flags) { /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ if (atomic_add_unless(atomic, -1, 1)) return 0; /* Otherwise do it the slow way */ spin_lock_irqsave(lock, *flags); if (atomic_dec_and_test(atomic)) return 1; spin_unlock_irqrestore(lock, *flags); return 0; } EXPORT_SYMBOL(_atomic_dec_and_lock_irqsave); int _atomic_dec_and_raw_lock(atomic_t *atomic, raw_spinlock_t *lock) { /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ if (atomic_add_unless(atomic, -1, 1)) return 0; /* Otherwise do it the slow way */ raw_spin_lock(lock); if (atomic_dec_and_test(atomic)) return 1; raw_spin_unlock(lock); return 0; } EXPORT_SYMBOL(_atomic_dec_and_raw_lock); int _atomic_dec_and_raw_lock_irqsave(atomic_t *atomic, raw_spinlock_t *lock, unsigned long *flags) { /* Subtract 1 from counter unless that drops it to 0 (ie. 
it was 1) */ if (atomic_add_unless(atomic, -1, 1)) return 0; /* Otherwise do it the slow way */ raw_spin_lock_irqsave(lock, *flags); if (atomic_dec_and_test(atomic)) return 1; raw_spin_unlock_irqrestore(lock, *flags); return 0; } EXPORT_SYMBOL(_atomic_dec_and_raw_lock_irqsave);
15 2 1 1 15 14 10 2 9 15 12 13 12 18 7 12 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 // SPDX-License-Identifier: GPL-2.0-only /* * GHASH: hash function for GCM (Galois/Counter Mode). * * Copyright (c) 2007 Nokia Siemens Networks - Mikko Herranen <mh1@iki.fi> * Copyright (c) 2009 Intel Corp. * Author: Huang Ying <ying.huang@intel.com> */ /* * GHASH is a keyed hash function used in GCM authentication tag generation. * * The original GCM paper [1] presents GHASH as a function GHASH(H, A, C) which * takes a 16-byte hash key H, additional authenticated data A, and a ciphertext * C. It formats A and C into a single byte string X, interprets X as a * polynomial over GF(2^128), and evaluates this polynomial at the point H. * * However, the NIST standard for GCM [2] presents GHASH as GHASH(H, X) where X * is the already-formatted byte string containing both A and C. * * "ghash" in the Linux crypto API uses the 'X' (pre-formatted) convention, * since the API supports only a single data stream per hash. Thus, the * formatting of 'A' and 'C' is done in the "gcm" template, not in "ghash". * * The reason "ghash" is separate from "gcm" is to allow "gcm" to use an * accelerated "ghash" when a standalone accelerated "gcm(aes)" is unavailable. * It is generally inappropriate to use "ghash" for other purposes, since it is * an "ε-almost-XOR-universal hash function", not a cryptographic hash function. 
* It can only be used securely in crypto modes specially designed to use it. * * [1] The Galois/Counter Mode of Operation (GCM) * (http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.694.695&rep=rep1&type=pdf) * [2] Recommendation for Block Cipher Modes of Operation: Galois/Counter Mode (GCM) and GMAC * (https://csrc.nist.gov/publications/detail/sp/800-38d/final) */ #include <crypto/algapi.h> #include <crypto/gf128mul.h> #include <crypto/ghash.h> #include <crypto/internal/hash.h> #include <linux/crypto.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> static int ghash_init(struct shash_desc *desc) { struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); memset(dctx, 0, sizeof(*dctx)); return 0; } static int ghash_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen) { struct ghash_ctx *ctx = crypto_shash_ctx(tfm); be128 k; if (keylen != GHASH_BLOCK_SIZE) return -EINVAL; if (ctx->gf128) gf128mul_free_4k(ctx->gf128); BUILD_BUG_ON(sizeof(k) != GHASH_BLOCK_SIZE); memcpy(&k, key, GHASH_BLOCK_SIZE); /* avoid violating alignment rules */ ctx->gf128 = gf128mul_init_4k_lle(&k); memzero_explicit(&k, GHASH_BLOCK_SIZE); if (!ctx->gf128) return -ENOMEM; return 0; } static int ghash_update(struct shash_desc *desc, const u8 *src, unsigned int srclen) { struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm); u8 *dst = dctx->buffer; if (dctx->bytes) { int n = min(srclen, dctx->bytes); u8 *pos = dst + (GHASH_BLOCK_SIZE - dctx->bytes); dctx->bytes -= n; srclen -= n; while (n--) *pos++ ^= *src++; if (!dctx->bytes) gf128mul_4k_lle((be128 *)dst, ctx->gf128); } while (srclen >= GHASH_BLOCK_SIZE) { crypto_xor(dst, src, GHASH_BLOCK_SIZE); gf128mul_4k_lle((be128 *)dst, ctx->gf128); src += GHASH_BLOCK_SIZE; srclen -= GHASH_BLOCK_SIZE; } if (srclen) { dctx->bytes = GHASH_BLOCK_SIZE - srclen; while (srclen--) *dst++ ^= *src++; } return 0; } static void ghash_flush(struct ghash_ctx *ctx, struct 
ghash_desc_ctx *dctx) { u8 *dst = dctx->buffer; if (dctx->bytes) { u8 *tmp = dst + (GHASH_BLOCK_SIZE - dctx->bytes); while (dctx->bytes--) *tmp++ ^= 0; gf128mul_4k_lle((be128 *)dst, ctx->gf128); } dctx->bytes = 0; } static int ghash_final(struct shash_desc *desc, u8 *dst) { struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm); u8 *buf = dctx->buffer; ghash_flush(ctx, dctx); memcpy(dst, buf, GHASH_BLOCK_SIZE); return 0; } static void ghash_exit_tfm(struct crypto_tfm *tfm) { struct ghash_ctx *ctx = crypto_tfm_ctx(tfm); if (ctx->gf128) gf128mul_free_4k(ctx->gf128); } static struct shash_alg ghash_alg = { .digestsize = GHASH_DIGEST_SIZE, .init = ghash_init, .update = ghash_update, .final = ghash_final, .setkey = ghash_setkey, .descsize = sizeof(struct ghash_desc_ctx), .base = { .cra_name = "ghash", .cra_driver_name = "ghash-generic", .cra_priority = 100, .cra_blocksize = GHASH_BLOCK_SIZE, .cra_ctxsize = sizeof(struct ghash_ctx), .cra_module = THIS_MODULE, .cra_exit = ghash_exit_tfm, }, }; static int __init ghash_mod_init(void) { return crypto_register_shash(&ghash_alg); } static void __exit ghash_mod_exit(void) { crypto_unregister_shash(&ghash_alg); } subsys_initcall(ghash_mod_init); module_exit(ghash_mod_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("GHASH hash function"); MODULE_ALIAS_CRYPTO("ghash"); MODULE_ALIAS_CRYPTO("ghash-generic");
2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 
527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * This file implements the various access functions for the * PROC file system. It is mainly used for debugging and * statistics. * * Authors: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Gerald J. Heim, <heim@peanuts.informatik.uni-tuebingen.de> * Fred Baumgarten, <dc6iq@insu1.etec.uni-karlsruhe.de> * Erik Schoenfelder, <schoenfr@ibr.cs.tu-bs.de> * * Fixes: * Alan Cox : UDP sockets show the rxqueue/txqueue * using hint flag for the netinfo. * Pauline Middelink : identd support * Alan Cox : Make /proc safer. * Erik Schoenfelder : /proc/net/snmp * Alan Cox : Handle dead sockets properly. * Gerhard Koerting : Show both timers * Alan Cox : Allow inode to be NULL (kernel socket) * Andi Kleen : Add support for open_requests and * split functions for more readability. * Andi Kleen : Add support for /proc/net/netstat * Arnaldo C. 
Melo : Convert to seq_file
 */
#include <linux/types.h>
#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/tcp.h>
#include <net/mptcp.h>
#include <net/udp.h>
#include <net/udplite.h>
#include <linux/bottom_half.h>
#include <linux/inetdevice.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/export.h>
#include <net/sock.h>
#include <net/raw.h>

/* The larger of UDP_MIB_MAX and TCP_MIB_MAX, compared as u32. */
#define TCPUDP_MIB_MAX max_t(u32, UDP_MIB_MAX, TCP_MIB_MAX)

/*
 *	Report socket allocation statistics [mea@utu.fi]
 *
 *	Emits the generic socket line via socket_seq_show(), then one line
 *	each for TCP, UDP, UDPLITE, RAW and FRAG. Always returns 0.
 */
static int sockstat_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = seq->private;
	int orphans, sockets;

	orphans = tcp_orphan_count_sum();
	sockets = proto_sockets_allocated_sum_positive(&tcp_prot);

	socket_seq_show(seq);
	/*
	 * "tw" is reported as tw_refcount - 1.
	 * NOTE(review): presumably the refcount carries a base reference of
	 * one that is not a time-wait socket - confirm against its init site.
	 */
	seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
		   sock_prot_inuse_get(net, &tcp_prot), orphans,
		   refcount_read(&net->ipv4.tcp_death_row.tw_refcount) - 1,
		   sockets, proto_memory_allocated(&tcp_prot));
	seq_printf(seq, "UDP: inuse %d mem %ld\n",
		   sock_prot_inuse_get(net, &udp_prot),
		   proto_memory_allocated(&udp_prot));
	seq_printf(seq, "UDPLITE: inuse %d\n",
		   sock_prot_inuse_get(net, &udplite_prot));
	seq_printf(seq, "RAW: inuse %d\n",
		   sock_prot_inuse_get(net, &raw_prot));
	seq_printf(seq, "FRAG: inuse %u memory %lu\n",
		   atomic_read(&net->ipv4.fqdir->rhashtable.nelems),
		   frag_mem_limit(net->ipv4.fqdir));
	return 0;
}

/*
 * snmp items
 *
 * Each table pairs a printed column name with a MIB counter index and is
 * closed by SNMP_MIB_SENTINEL. The show routines in this file walk a table
 * once for the names and once for the values, so entry order is the column
 * order of the generated /proc text.
 */
static const struct snmp_mib snmp4_ipstats_list[] = {
	SNMP_MIB_ITEM("InReceives", IPSTATS_MIB_INPKTS),
	SNMP_MIB_ITEM("InHdrErrors", IPSTATS_MIB_INHDRERRORS),
	SNMP_MIB_ITEM("InAddrErrors", IPSTATS_MIB_INADDRERRORS),
	SNMP_MIB_ITEM("ForwDatagrams", IPSTATS_MIB_OUTFORWDATAGRAMS),
	SNMP_MIB_ITEM("InUnknownProtos", IPSTATS_MIB_INUNKNOWNPROTOS),
	SNMP_MIB_ITEM("InDiscards", IPSTATS_MIB_INDISCARDS),
	SNMP_MIB_ITEM("InDelivers", IPSTATS_MIB_INDELIVERS),
	SNMP_MIB_ITEM("OutRequests", IPSTATS_MIB_OUTREQUESTS),
	SNMP_MIB_ITEM("OutDiscards", IPSTATS_MIB_OUTDISCARDS),
	SNMP_MIB_ITEM("OutNoRoutes", IPSTATS_MIB_OUTNOROUTES),
	SNMP_MIB_ITEM("ReasmTimeout", IPSTATS_MIB_REASMTIMEOUT),
	SNMP_MIB_ITEM("ReasmReqds", IPSTATS_MIB_REASMREQDS),
	SNMP_MIB_ITEM("ReasmOKs", IPSTATS_MIB_REASMOKS),
	SNMP_MIB_ITEM("ReasmFails", IPSTATS_MIB_REASMFAILS),
	SNMP_MIB_ITEM("FragOKs", IPSTATS_MIB_FRAGOKS),
	SNMP_MIB_ITEM("FragFails", IPSTATS_MIB_FRAGFAILS),
	SNMP_MIB_ITEM("FragCreates", IPSTATS_MIB_FRAGCREATES),
	SNMP_MIB_ITEM("OutTransmits", IPSTATS_MIB_OUTPKTS),
	SNMP_MIB_SENTINEL
};

/* Following items are displayed in /proc/net/netstat */
static const struct snmp_mib snmp4_ipextstats_list[] = {
	SNMP_MIB_ITEM("InNoRoutes", IPSTATS_MIB_INNOROUTES),
	SNMP_MIB_ITEM("InTruncatedPkts", IPSTATS_MIB_INTRUNCATEDPKTS),
	SNMP_MIB_ITEM("InMcastPkts", IPSTATS_MIB_INMCASTPKTS),
	SNMP_MIB_ITEM("OutMcastPkts", IPSTATS_MIB_OUTMCASTPKTS),
	SNMP_MIB_ITEM("InBcastPkts", IPSTATS_MIB_INBCASTPKTS),
	SNMP_MIB_ITEM("OutBcastPkts", IPSTATS_MIB_OUTBCASTPKTS),
	SNMP_MIB_ITEM("InOctets", IPSTATS_MIB_INOCTETS),
	SNMP_MIB_ITEM("OutOctets", IPSTATS_MIB_OUTOCTETS),
	SNMP_MIB_ITEM("InMcastOctets", IPSTATS_MIB_INMCASTOCTETS),
	SNMP_MIB_ITEM("OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS),
	SNMP_MIB_ITEM("InBcastOctets", IPSTATS_MIB_INBCASTOCTETS),
	SNMP_MIB_ITEM("OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS),
	/* Non RFC4293 fields */
	SNMP_MIB_ITEM("InCsumErrors", IPSTATS_MIB_CSUMERRORS),
	SNMP_MIB_ITEM("InNoECTPkts", IPSTATS_MIB_NOECTPKTS),
	SNMP_MIB_ITEM("InECT1Pkts", IPSTATS_MIB_ECT1PKTS),
	SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS),
	SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS),
	SNMP_MIB_ITEM("ReasmOverlaps", IPSTATS_MIB_REASM_OVERLAPS),
	SNMP_MIB_SENTINEL
};

/*
 * Column name suffix -> ICMP type. icmp_put() prints every entry twice,
 * prefixed "In" and "Out"; the list ends at the entry whose name is NULL.
 */
static const struct {
	const char *name;
	int index;
} icmpmibmap[] = {
	{ "DestUnreachs", ICMP_DEST_UNREACH },
	{ "TimeExcds", ICMP_TIME_EXCEEDED },
	{ "ParmProbs", ICMP_PARAMETERPROB },
	{ "SrcQuenchs", ICMP_SOURCE_QUENCH },
	{ "Redirects", ICMP_REDIRECT },
	{ "Echos", ICMP_ECHO },
	{ "EchoReps", ICMP_ECHOREPLY },
	{ "Timestamps", ICMP_TIMESTAMP },
	{ "TimestampReps", ICMP_TIMESTAMPREPLY },
	{ "AddrMasks", ICMP_ADDRESS },
	{ "AddrMaskReps", ICMP_ADDRESSREPLY },
	{ NULL, 0 }
};

/* Columns of the "Tcp:" lines. */
static const struct snmp_mib snmp4_tcp_list[] = {
	SNMP_MIB_ITEM("RtoAlgorithm", TCP_MIB_RTOALGORITHM),
	SNMP_MIB_ITEM("RtoMin", TCP_MIB_RTOMIN),
	SNMP_MIB_ITEM("RtoMax", TCP_MIB_RTOMAX),
	SNMP_MIB_ITEM("MaxConn", TCP_MIB_MAXCONN),
	SNMP_MIB_ITEM("ActiveOpens", TCP_MIB_ACTIVEOPENS),
	SNMP_MIB_ITEM("PassiveOpens", TCP_MIB_PASSIVEOPENS),
	SNMP_MIB_ITEM("AttemptFails", TCP_MIB_ATTEMPTFAILS),
	SNMP_MIB_ITEM("EstabResets", TCP_MIB_ESTABRESETS),
	SNMP_MIB_ITEM("CurrEstab", TCP_MIB_CURRESTAB),
	SNMP_MIB_ITEM("InSegs", TCP_MIB_INSEGS),
	SNMP_MIB_ITEM("OutSegs", TCP_MIB_OUTSEGS),
	SNMP_MIB_ITEM("RetransSegs", TCP_MIB_RETRANSSEGS),
	SNMP_MIB_ITEM("InErrs", TCP_MIB_INERRS),
	SNMP_MIB_ITEM("OutRsts", TCP_MIB_OUTRSTS),
	SNMP_MIB_ITEM("InCsumErrors", TCP_MIB_CSUMERRORS),
	SNMP_MIB_SENTINEL
};

/* Columns of the "Udp:" lines. */
static const struct snmp_mib snmp4_udp_list[] = {
	SNMP_MIB_ITEM("InDatagrams", UDP_MIB_INDATAGRAMS),
	SNMP_MIB_ITEM("NoPorts", UDP_MIB_NOPORTS),
	SNMP_MIB_ITEM("InErrors", UDP_MIB_INERRORS),
	SNMP_MIB_ITEM("OutDatagrams", UDP_MIB_OUTDATAGRAMS),
	SNMP_MIB_ITEM("RcvbufErrors", UDP_MIB_RCVBUFERRORS),
	SNMP_MIB_ITEM("SndbufErrors", UDP_MIB_SNDBUFERRORS),
	SNMP_MIB_ITEM("InCsumErrors", UDP_MIB_CSUMERRORS),
	SNMP_MIB_ITEM("IgnoredMulti", UDP_MIB_IGNOREDMULTI),
	SNMP_MIB_ITEM("MemErrors", UDP_MIB_MEMERRORS),
	SNMP_MIB_SENTINEL
};

/* Columns of the "TcpExt:" lines. */
static const struct snmp_mib snmp4_net_list[] = {
	SNMP_MIB_ITEM("SyncookiesSent", LINUX_MIB_SYNCOOKIESSENT),
	SNMP_MIB_ITEM("SyncookiesRecv", LINUX_MIB_SYNCOOKIESRECV),
	SNMP_MIB_ITEM("SyncookiesFailed", LINUX_MIB_SYNCOOKIESFAILED),
	SNMP_MIB_ITEM("EmbryonicRsts", LINUX_MIB_EMBRYONICRSTS),
	SNMP_MIB_ITEM("PruneCalled", LINUX_MIB_PRUNECALLED),
	SNMP_MIB_ITEM("RcvPruned", LINUX_MIB_RCVPRUNED),
	SNMP_MIB_ITEM("OfoPruned", LINUX_MIB_OFOPRUNED),
	SNMP_MIB_ITEM("OutOfWindowIcmps", LINUX_MIB_OUTOFWINDOWICMPS),
	SNMP_MIB_ITEM("LockDroppedIcmps", LINUX_MIB_LOCKDROPPEDICMPS),
	SNMP_MIB_ITEM("ArpFilter", LINUX_MIB_ARPFILTER),
	SNMP_MIB_ITEM("TW", LINUX_MIB_TIMEWAITED),
	SNMP_MIB_ITEM("TWRecycled", LINUX_MIB_TIMEWAITRECYCLED),
	SNMP_MIB_ITEM("TWKilled", LINUX_MIB_TIMEWAITKILLED),
	SNMP_MIB_ITEM("PAWSActive", LINUX_MIB_PAWSACTIVEREJECTED),
	SNMP_MIB_ITEM("PAWSEstab", LINUX_MIB_PAWSESTABREJECTED),
	SNMP_MIB_ITEM("DelayedACKs", LINUX_MIB_DELAYEDACKS),
	SNMP_MIB_ITEM("DelayedACKLocked", LINUX_MIB_DELAYEDACKLOCKED),
	SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST),
	SNMP_MIB_ITEM("ListenOverflows", LINUX_MIB_LISTENOVERFLOWS),
	SNMP_MIB_ITEM("ListenDrops", LINUX_MIB_LISTENDROPS),
	SNMP_MIB_ITEM("TCPHPHits", LINUX_MIB_TCPHPHITS),
	SNMP_MIB_ITEM("TCPPureAcks", LINUX_MIB_TCPPUREACKS),
	SNMP_MIB_ITEM("TCPHPAcks", LINUX_MIB_TCPHPACKS),
	SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY),
	SNMP_MIB_ITEM("TCPSackRecovery", LINUX_MIB_TCPSACKRECOVERY),
	SNMP_MIB_ITEM("TCPSACKReneging", LINUX_MIB_TCPSACKRENEGING),
	SNMP_MIB_ITEM("TCPSACKReorder", LINUX_MIB_TCPSACKREORDER),
	SNMP_MIB_ITEM("TCPRenoReorder", LINUX_MIB_TCPRENOREORDER),
	SNMP_MIB_ITEM("TCPTSReorder", LINUX_MIB_TCPTSREORDER),
	SNMP_MIB_ITEM("TCPFullUndo", LINUX_MIB_TCPFULLUNDO),
	SNMP_MIB_ITEM("TCPPartialUndo", LINUX_MIB_TCPPARTIALUNDO),
	SNMP_MIB_ITEM("TCPDSACKUndo", LINUX_MIB_TCPDSACKUNDO),
	SNMP_MIB_ITEM("TCPLossUndo", LINUX_MIB_TCPLOSSUNDO),
	SNMP_MIB_ITEM("TCPLostRetransmit", LINUX_MIB_TCPLOSTRETRANSMIT),
	SNMP_MIB_ITEM("TCPRenoFailures", LINUX_MIB_TCPRENOFAILURES),
	SNMP_MIB_ITEM("TCPSackFailures", LINUX_MIB_TCPSACKFAILURES),
	SNMP_MIB_ITEM("TCPLossFailures", LINUX_MIB_TCPLOSSFAILURES),
	SNMP_MIB_ITEM("TCPFastRetrans", LINUX_MIB_TCPFASTRETRANS),
	SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS),
	SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS),
	SNMP_MIB_ITEM("TCPLossProbes", LINUX_MIB_TCPLOSSPROBES),
	SNMP_MIB_ITEM("TCPLossProbeRecovery", LINUX_MIB_TCPLOSSPROBERECOVERY),
	SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL),
	SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL),
	SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED),
	SNMP_MIB_ITEM("TCPBacklogCoalesce", LINUX_MIB_TCPBACKLOGCOALESCE),
	SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT),
	SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT),
	SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV),
	SNMP_MIB_ITEM("TCPDSACKOfoRecv", LINUX_MIB_TCPDSACKOFORECV),
	SNMP_MIB_ITEM("TCPAbortOnData", LINUX_MIB_TCPABORTONDATA),
	SNMP_MIB_ITEM("TCPAbortOnClose", LINUX_MIB_TCPABORTONCLOSE),
	SNMP_MIB_ITEM("TCPAbortOnMemory", LINUX_MIB_TCPABORTONMEMORY),
	SNMP_MIB_ITEM("TCPAbortOnTimeout", LINUX_MIB_TCPABORTONTIMEOUT),
	SNMP_MIB_ITEM("TCPAbortOnLinger", LINUX_MIB_TCPABORTONLINGER),
	SNMP_MIB_ITEM("TCPAbortFailed", LINUX_MIB_TCPABORTFAILED),
	SNMP_MIB_ITEM("TCPMemoryPressures", LINUX_MIB_TCPMEMORYPRESSURES),
	SNMP_MIB_ITEM("TCPMemoryPressuresChrono", LINUX_MIB_TCPMEMORYPRESSURESCHRONO),
	SNMP_MIB_ITEM("TCPSACKDiscard", LINUX_MIB_TCPSACKDISCARD),
	SNMP_MIB_ITEM("TCPDSACKIgnoredOld", LINUX_MIB_TCPDSACKIGNOREDOLD),
	SNMP_MIB_ITEM("TCPDSACKIgnoredNoUndo", LINUX_MIB_TCPDSACKIGNOREDNOUNDO),
	SNMP_MIB_ITEM("TCPSpuriousRTOs", LINUX_MIB_TCPSPURIOUSRTOS),
	SNMP_MIB_ITEM("TCPMD5NotFound", LINUX_MIB_TCPMD5NOTFOUND),
	SNMP_MIB_ITEM("TCPMD5Unexpected", LINUX_MIB_TCPMD5UNEXPECTED),
	SNMP_MIB_ITEM("TCPMD5Failure", LINUX_MIB_TCPMD5FAILURE),
	SNMP_MIB_ITEM("TCPSackShifted", LINUX_MIB_SACKSHIFTED),
	SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED),
	SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK),
	SNMP_MIB_ITEM("TCPBacklogDrop", LINUX_MIB_TCPBACKLOGDROP),
	SNMP_MIB_ITEM("PFMemallocDrop", LINUX_MIB_PFMEMALLOCDROP),
	SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP),
	SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP),
	SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER),
	SNMP_MIB_ITEM("TCPTimeWaitOverflow", LINUX_MIB_TCPTIMEWAITOVERFLOW),
	SNMP_MIB_ITEM("TCPReqQFullDoCookies", LINUX_MIB_TCPREQQFULLDOCOOKIES),
	SNMP_MIB_ITEM("TCPReqQFullDrop", LINUX_MIB_TCPREQQFULLDROP),
	SNMP_MIB_ITEM("TCPRetransFail", LINUX_MIB_TCPRETRANSFAIL),
	SNMP_MIB_ITEM("TCPRcvCoalesce", LINUX_MIB_TCPRCVCOALESCE),
	SNMP_MIB_ITEM("TCPOFOQueue", LINUX_MIB_TCPOFOQUEUE),
	SNMP_MIB_ITEM("TCPOFODrop", LINUX_MIB_TCPOFODROP),
	SNMP_MIB_ITEM("TCPOFOMerge", LINUX_MIB_TCPOFOMERGE),
	SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK),
	SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE),
	SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE),
	SNMP_MIB_ITEM("TCPFastOpenActiveFail", LINUX_MIB_TCPFASTOPENACTIVEFAIL),
	SNMP_MIB_ITEM("TCPFastOpenPassive", LINUX_MIB_TCPFASTOPENPASSIVE),
	SNMP_MIB_ITEM("TCPFastOpenPassiveFail", LINUX_MIB_TCPFASTOPENPASSIVEFAIL),
	SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW),
	SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD),
	SNMP_MIB_ITEM("TCPFastOpenBlackhole", LINUX_MIB_TCPFASTOPENBLACKHOLE),
	SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES),
	SNMP_MIB_ITEM("BusyPollRxPackets", LINUX_MIB_BUSYPOLLRXPACKETS),
	SNMP_MIB_ITEM("TCPAutoCorking", LINUX_MIB_TCPAUTOCORKING),
	SNMP_MIB_ITEM("TCPFromZeroWindowAdv", LINUX_MIB_TCPFROMZEROWINDOWADV),
	SNMP_MIB_ITEM("TCPToZeroWindowAdv", LINUX_MIB_TCPTOZEROWINDOWADV),
	SNMP_MIB_ITEM("TCPWantZeroWindowAdv", LINUX_MIB_TCPWANTZEROWINDOWADV),
	SNMP_MIB_ITEM("TCPSynRetrans", LINUX_MIB_TCPSYNRETRANS),
	SNMP_MIB_ITEM("TCPOrigDataSent", LINUX_MIB_TCPORIGDATASENT),
	SNMP_MIB_ITEM("TCPHystartTrainDetect", LINUX_MIB_TCPHYSTARTTRAINDETECT),
	SNMP_MIB_ITEM("TCPHystartTrainCwnd", LINUX_MIB_TCPHYSTARTTRAINCWND),
	SNMP_MIB_ITEM("TCPHystartDelayDetect", LINUX_MIB_TCPHYSTARTDELAYDETECT),
	SNMP_MIB_ITEM("TCPHystartDelayCwnd", LINUX_MIB_TCPHYSTARTDELAYCWND),
	SNMP_MIB_ITEM("TCPACKSkippedSynRecv", LINUX_MIB_TCPACKSKIPPEDSYNRECV),
	SNMP_MIB_ITEM("TCPACKSkippedPAWS", LINUX_MIB_TCPACKSKIPPEDPAWS),
	SNMP_MIB_ITEM("TCPACKSkippedSeq", LINUX_MIB_TCPACKSKIPPEDSEQ),
	SNMP_MIB_ITEM("TCPACKSkippedFinWait2", LINUX_MIB_TCPACKSKIPPEDFINWAIT2),
	SNMP_MIB_ITEM("TCPACKSkippedTimeWait", LINUX_MIB_TCPACKSKIPPEDTIMEWAIT),
	SNMP_MIB_ITEM("TCPACKSkippedChallenge", LINUX_MIB_TCPACKSKIPPEDCHALLENGE),
	SNMP_MIB_ITEM("TCPWinProbe", LINUX_MIB_TCPWINPROBE),
	SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE),
	SNMP_MIB_ITEM("TCPMTUPFail", LINUX_MIB_TCPMTUPFAIL),
	SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS),
	SNMP_MIB_ITEM("TCPDelivered", LINUX_MIB_TCPDELIVERED),
	SNMP_MIB_ITEM("TCPDeliveredCE", LINUX_MIB_TCPDELIVEREDCE),
	SNMP_MIB_ITEM("TCPAckCompressed", LINUX_MIB_TCPACKCOMPRESSED),
	SNMP_MIB_ITEM("TCPZeroWindowDrop", LINUX_MIB_TCPZEROWINDOWDROP),
	SNMP_MIB_ITEM("TCPRcvQDrop", LINUX_MIB_TCPRCVQDROP),
	SNMP_MIB_ITEM("TCPWqueueTooBig", LINUX_MIB_TCPWQUEUETOOBIG),
	SNMP_MIB_ITEM("TCPFastOpenPassiveAltKey", LINUX_MIB_TCPFASTOPENPASSIVEALTKEY),
	SNMP_MIB_ITEM("TcpTimeoutRehash", LINUX_MIB_TCPTIMEOUTREHASH),
	SNMP_MIB_ITEM("TcpDuplicateDataRehash", LINUX_MIB_TCPDUPLICATEDATAREHASH),
	SNMP_MIB_ITEM("TCPDSACKRecvSegs", LINUX_MIB_TCPDSACKRECVSEGS),
	SNMP_MIB_ITEM("TCPDSACKIgnoredDubious", LINUX_MIB_TCPDSACKIGNOREDDUBIOUS),
	SNMP_MIB_ITEM("TCPMigrateReqSuccess", LINUX_MIB_TCPMIGRATEREQSUCCESS),
	SNMP_MIB_ITEM("TCPMigrateReqFailure", LINUX_MIB_TCPMIGRATEREQFAILURE),
	SNMP_MIB_ITEM("TCPPLBRehash", LINUX_MIB_TCPPLBREHASH),
	SNMP_MIB_ITEM("TCPAORequired", LINUX_MIB_TCPAOREQUIRED),
	SNMP_MIB_ITEM("TCPAOBad", LINUX_MIB_TCPAOBAD),
	SNMP_MIB_ITEM("TCPAOKeyNotFound", LINUX_MIB_TCPAOKEYNOTFOUND),
	SNMP_MIB_ITEM("TCPAOGood", LINUX_MIB_TCPAOGOOD),
	SNMP_MIB_ITEM("TCPAODroppedIcmps", LINUX_MIB_TCPAODROPPEDICMPS),
	SNMP_MIB_SENTINEL
};

/*
 * Print one "IcmpMsg:" header line and one "IcmpMsg:" value line for
 * @count collected counters. Nothing is printed when @count is 0.
 *
 * @vals: counter values
 * @type: matching counter indices; bit 0x100 selects the "Out" label,
 *        the low 8 bits are printed as the type number
 */
static void icmpmsg_put_line(struct seq_file *seq, unsigned long *vals,
			     unsigned short *type, int count)
{
	int j;

	if (count) {
		seq_puts(seq, "\nIcmpMsg:");
		for (j = 0; j < count; ++j)
			seq_printf(seq, " %sType%u",
				type[j] & 0x100 ? "Out" : "In",
				type[j] & 0xff);
		seq_puts(seq, "\nIcmpMsg:");
		for (j = 0; j < count; ++j)
			seq_printf(seq, " %lu", vals[j]);
	}
}

/*
 * Print every non-zero per-type ICMP message counter, PERLINE counters
 * per pair of output lines.
 */
static void icmpmsg_put(struct seq_file *seq)
{
#define PERLINE	16

	int i, count;
	unsigned short type[PERLINE];
	unsigned long vals[PERLINE], val;
	struct net *net = seq->private;

	count = 0;
	for (i = 0; i < ICMPMSG_MIB_MAX; i++) {
		val = atomic_long_read(&net->mib.icmpmsg_statistics->mibs[i]);
		/* zero counters are skipped entirely */
		if (val) {
			type[count] = i;
			vals[count++] = val;
		}
		/* batch is full: emit it and start a new one */
		if (count == PERLINE) {
			icmpmsg_put_line(seq, vals, type, count);
			count = 0;
		}
	}
	/* emit the final, possibly short, batch (no-op when empty) */
	icmpmsg_put_line(seq, vals, type, count);

#undef PERLINE
}

/*
 * Print the "Icmp:" header and value lines. Aggregate counters come from
 * icmp_statistics; the per-type columns named by icmpmibmap are read from
 * icmpmsg_statistics at index (input) and index | 0x100 (output).
 */
static void icmp_put(struct seq_file *seq)
{
	int i;
	struct net *net = seq->private;
	atomic_long_t *ptr = net->mib.icmpmsg_statistics->mibs;

	seq_puts(seq, "\nIcmp: InMsgs InErrors InCsumErrors");
	for (i = 0; icmpmibmap[i].name; i++)
		seq_printf(seq, " In%s", icmpmibmap[i].name);
	seq_puts(seq, " OutMsgs OutErrors OutRateLimitGlobal OutRateLimitHost");
	for (i = 0; icmpmibmap[i].name; i++)
		seq_printf(seq, " Out%s", icmpmibmap[i].name);
	seq_printf(seq, "\nIcmp: %lu %lu %lu",
		snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INMSGS),
		snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INERRORS),
		snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_CSUMERRORS));
	for (i = 0; icmpmibmap[i].name; i++)
		seq_printf(seq, " %lu",
			   atomic_long_read(ptr + icmpmibmap[i].index));
	seq_printf(seq, " %lu %lu %lu %lu",
		snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTMSGS),
		snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTERRORS),
		snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_RATELIMITGLOBAL),
		snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_RATELIMITHOST));
	for (i = 0; icmpmibmap[i].name; i++)
		seq_printf(seq, " %lu",
			   atomic_long_read(ptr + (icmpmibmap[i].index | 0x100)));
}

/*
 *	Called from the PROCfs module. This outputs /proc/net/snmp.
*/ static int snmp_seq_show_ipstats(struct seq_file *seq, void *v) { struct net *net = seq->private; u64 buff64[IPSTATS_MIB_MAX]; int i; memset(buff64, 0, IPSTATS_MIB_MAX * sizeof(u64)); seq_puts(seq, "Ip: Forwarding DefaultTTL"); for (i = 0; snmp4_ipstats_list[i].name; i++) seq_printf(seq, " %s", snmp4_ipstats_list[i].name); seq_printf(seq, "\nIp: %d %d", IPV4_DEVCONF_ALL_RO(net, FORWARDING) ? 1 : 2, READ_ONCE(net->ipv4.sysctl_ip_default_ttl)); BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0); snmp_get_cpu_field64_batch(buff64, snmp4_ipstats_list, net->mib.ip_statistics, offsetof(struct ipstats_mib, syncp)); for (i = 0; snmp4_ipstats_list[i].name; i++) seq_printf(seq, " %llu", buff64[i]); return 0; } static int snmp_seq_show_tcp_udp(struct seq_file *seq, void *v) { unsigned long buff[TCPUDP_MIB_MAX]; struct net *net = seq->private; int i; memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long)); seq_puts(seq, "\nTcp:"); for (i = 0; snmp4_tcp_list[i].name; i++) seq_printf(seq, " %s", snmp4_tcp_list[i].name); seq_puts(seq, "\nTcp:"); snmp_get_cpu_field_batch(buff, snmp4_tcp_list, net->mib.tcp_statistics); for (i = 0; snmp4_tcp_list[i].name; i++) { /* MaxConn field is signed, RFC 2012 */ if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN) seq_printf(seq, " %ld", buff[i]); else seq_printf(seq, " %lu", buff[i]); } memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long)); snmp_get_cpu_field_batch(buff, snmp4_udp_list, net->mib.udp_statistics); seq_puts(seq, "\nUdp:"); for (i = 0; snmp4_udp_list[i].name; i++) seq_printf(seq, " %s", snmp4_udp_list[i].name); seq_puts(seq, "\nUdp:"); for (i = 0; snmp4_udp_list[i].name; i++) seq_printf(seq, " %lu", buff[i]); memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long)); /* the UDP and UDP-Lite MIBs are the same */ seq_puts(seq, "\nUdpLite:"); snmp_get_cpu_field_batch(buff, snmp4_udp_list, net->mib.udplite_statistics); for (i = 0; snmp4_udp_list[i].name; i++) seq_printf(seq, " %s", snmp4_udp_list[i].name); seq_puts(seq, 
"\nUdpLite:"); for (i = 0; snmp4_udp_list[i].name; i++) seq_printf(seq, " %lu", buff[i]); seq_putc(seq, '\n'); return 0; } static int snmp_seq_show(struct seq_file *seq, void *v) { snmp_seq_show_ipstats(seq, v); icmp_put(seq); /* RFC 2011 compatibility */ icmpmsg_put(seq); snmp_seq_show_tcp_udp(seq, v); return 0; } /* * Output /proc/net/netstat */ static int netstat_seq_show(struct seq_file *seq, void *v) { const int ip_cnt = ARRAY_SIZE(snmp4_ipextstats_list) - 1; const int tcp_cnt = ARRAY_SIZE(snmp4_net_list) - 1; struct net *net = seq->private; unsigned long *buff; int i; seq_puts(seq, "TcpExt:"); for (i = 0; i < tcp_cnt; i++) seq_printf(seq, " %s", snmp4_net_list[i].name); seq_puts(seq, "\nTcpExt:"); buff = kzalloc(max(tcp_cnt * sizeof(long), ip_cnt * sizeof(u64)), GFP_KERNEL); if (buff) { snmp_get_cpu_field_batch(buff, snmp4_net_list, net->mib.net_statistics); for (i = 0; i < tcp_cnt; i++) seq_printf(seq, " %lu", buff[i]); } else { for (i = 0; i < tcp_cnt; i++) seq_printf(seq, " %lu", snmp_fold_field(net->mib.net_statistics, snmp4_net_list[i].entry)); } seq_puts(seq, "\nIpExt:"); for (i = 0; i < ip_cnt; i++) seq_printf(seq, " %s", snmp4_ipextstats_list[i].name); seq_puts(seq, "\nIpExt:"); if (buff) { u64 *buff64 = (u64 *)buff; memset(buff64, 0, ip_cnt * sizeof(u64)); snmp_get_cpu_field64_batch(buff64, snmp4_ipextstats_list, net->mib.ip_statistics, offsetof(struct ipstats_mib, syncp)); for (i = 0; i < ip_cnt; i++) seq_printf(seq, " %llu", buff64[i]); } else { for (i = 0; i < ip_cnt; i++) seq_printf(seq, " %llu", snmp_fold_field64(net->mib.ip_statistics, snmp4_ipextstats_list[i].entry, offsetof(struct ipstats_mib, syncp))); } kfree(buff); seq_putc(seq, '\n'); mptcp_seq_show(seq); return 0; } static __net_init int ip_proc_init_net(struct net *net) { if (!proc_create_net_single("sockstat", 0444, net->proc_net, sockstat_seq_show, NULL)) goto out_sockstat; if (!proc_create_net_single("netstat", 0444, net->proc_net, netstat_seq_show, NULL)) goto out_netstat; if 
(!proc_create_net_single("snmp", 0444, net->proc_net, snmp_seq_show, NULL)) goto out_snmp; return 0; out_snmp: remove_proc_entry("netstat", net->proc_net); out_netstat: remove_proc_entry("sockstat", net->proc_net); out_sockstat: return -ENOMEM; } static __net_exit void ip_proc_exit_net(struct net *net) { remove_proc_entry("snmp", net->proc_net); remove_proc_entry("netstat", net->proc_net); remove_proc_entry("sockstat", net->proc_net); } static __net_initdata struct pernet_operations ip_proc_ops = { .init = ip_proc_init_net, .exit = ip_proc_exit_net, }; int __init ip_misc_proc_init(void) { return register_pernet_subsys(&ip_proc_ops); }
4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM irq

#if !defined(_TRACE_IRQ_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_IRQ_H

#include <linux/tracepoint.h>

struct irqaction;
struct softirq_action;

/*
 * List of softirq names, written once and expanded twice below with
 * different definitions of softirq_name()/softirq_name_end().
 */
#define SOFTIRQ_NAME_LIST				\
			 softirq_name(HI)		\
			 softirq_name(TIMER)		\
			 softirq_name(NET_TX)		\
			 softirq_name(NET_RX)		\
			 softirq_name(BLOCK)		\
			 softirq_name(IRQ_POLL)		\
			 softirq_name(TASKLET)		\
			 softirq_name(SCHED)		\
			 softirq_name(HRTIMER)		\
			 softirq_name_end(RCU)

#undef softirq_name
#undef softirq_name_end

/* First expansion: one TRACE_DEFINE_ENUM() per <NAME>_SOFTIRQ constant. */
#define softirq_name(sirq) TRACE_DEFINE_ENUM(sirq##_SOFTIRQ);
#define softirq_name_end(sirq)  TRACE_DEFINE_ENUM(sirq##_SOFTIRQ);

SOFTIRQ_NAME_LIST

#undef softirq_name
#undef softirq_name_end

/*
 * Second expansion: { value, "name" } pairs for __print_symbolic(); the
 * _end variant omits the trailing comma.
 */
#define softirq_name(sirq) { sirq##_SOFTIRQ, #sirq },
#define softirq_name_end(sirq) { sirq##_SOFTIRQ, #sirq }

/* Map a softirq vector number to its name in TP_printk() output. */
#define show_softirq_name(val)				\
	__print_symbolic(val, SOFTIRQ_NAME_LIST)

/**
 * irq_handler_entry - called immediately before the irq action handler
 * @irq: irq number
 * @action: pointer to struct irqaction
 *
 * The struct irqaction pointed to by @action contains various
 * information about the handler, including the device name,
 * @action->name, and the device id, @action->dev_id. When used in
 * conjunction with the irq_handler_exit tracepoint, we can figure
 * out irq handler latencies.
 */
TRACE_EVENT(irq_handler_entry,

	TP_PROTO(int irq, struct irqaction *action),

	TP_ARGS(irq, action),

	TP_STRUCT__entry(
		__field(	int,	irq		)
		__string(	name,	action->name	)
	),

	TP_fast_assign(
		__entry->irq = irq;
		__assign_str(name, action->name);
	),

	TP_printk("irq=%d name=%s", __entry->irq, __get_str(name))
);

/**
 * irq_handler_exit - called immediately after the irq action handler returns
 * @irq: irq number
 * @action: pointer to struct irqaction
 * @ret: return value
 *
 * If the @ret value is set to IRQ_HANDLED, then we know that the corresponding
 * @action->handler successfully handled this irq. Otherwise, the irq might be
 * a shared irq line, or the irq was not handled successfully. Can be used in
 * conjunction with the irq_handler_entry to understand irq handler latencies.
 */
TRACE_EVENT(irq_handler_exit,

	TP_PROTO(int irq, struct irqaction *action, int ret),

	TP_ARGS(irq, action, ret),

	TP_STRUCT__entry(
		__field(	int,	irq	)
		__field(	int,	ret	)
	),

	TP_fast_assign(
		__entry->irq	= irq;
		__entry->ret	= ret;
	),

	/* any non-zero @ret is printed as "handled" */
	TP_printk("irq=%d ret=%s",
		  __entry->irq, __entry->ret ? "handled" : "unhandled")
);

/*
 * Event class shared by softirq_entry, softirq_exit and softirq_raise:
 * records the vector number and prints it together with its name.
 */
DECLARE_EVENT_CLASS(softirq,

	TP_PROTO(unsigned int vec_nr),

	TP_ARGS(vec_nr),

	TP_STRUCT__entry(
		__field(	unsigned int,	vec	)
	),

	TP_fast_assign(
		__entry->vec = vec_nr;
	),

	TP_printk("vec=%u [action=%s]", __entry->vec,
		  show_softirq_name(__entry->vec))
);

/**
 * softirq_entry - called immediately before the softirq handler
 * @vec_nr: softirq vector number
 *
 * When used in combination with the softirq_exit tracepoint
 * we can determine the softirq handler routine.
 */
DEFINE_EVENT(softirq, softirq_entry,

	TP_PROTO(unsigned int vec_nr),

	TP_ARGS(vec_nr)
);

/**
 * softirq_exit - called immediately after the softirq handler returns
 * @vec_nr: softirq vector number
 *
 * When used in combination with the softirq_entry tracepoint
 * we can determine the softirq handler routine.
 */
DEFINE_EVENT(softirq, softirq_exit,

	TP_PROTO(unsigned int vec_nr),

	TP_ARGS(vec_nr)
);

/**
 * softirq_raise - called immediately when a softirq is raised
 * @vec_nr: softirq vector number
 *
 * When used in combination with the softirq_entry tracepoint
 * we can determine the softirq raise to run latency.
 */
DEFINE_EVENT(softirq, softirq_raise,

	TP_PROTO(unsigned int vec_nr),

	TP_ARGS(vec_nr)
);

/*
 * Event class shared by tasklet_entry and tasklet_exit: records the
 * tasklet pointer and the function pointer and prints both with %ps.
 */
DECLARE_EVENT_CLASS(tasklet,

	TP_PROTO(struct tasklet_struct *t, void *func),

	TP_ARGS(t, func),

	TP_STRUCT__entry(
		__field(	void *,	tasklet)
		__field(	void *,	func)
	),

	TP_fast_assign(
		__entry->tasklet = t;
		__entry->func = func;
	),

	TP_printk("tasklet=%ps function=%ps", __entry->tasklet, __entry->func)
);

/**
 * tasklet_entry - called immediately before the tasklet is run
 * @t: tasklet pointer
 * @func: tasklet callback or function being run
 *
 * Used to find individual tasklet execution time
 */
DEFINE_EVENT(tasklet, tasklet_entry,

	TP_PROTO(struct tasklet_struct *t, void *func),

	TP_ARGS(t, func)
);

/**
 * tasklet_exit - called immediately after the tasklet is run
 * @t: tasklet pointer
 * @func: tasklet callback or function being run
 *
 * Used to find individual tasklet execution time
 */
DEFINE_EVENT(tasklet, tasklet_exit,

	TP_PROTO(struct tasklet_struct *t, void *func),

	TP_ARGS(t, func)
);

#endif /*  _TRACE_IRQ_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 // SPDX-License-Identifier: GPL-2.0-only /* 
Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:net,port type */ #include <linux/jhash.h> #include <linux/module.h> #include <linux/ip.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/random.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> #include <linux/netfilter/ipset/ip_set_getport.h> #include <linux/netfilter/ipset/ip_set_hash.h> #define IPSET_TYPE_REV_MIN 0 /* 1 SCTP and UDPLITE support added */ /* 2 Range as input support for IPv4 added */ /* 3 nomatch flag support added */ /* 4 Counters support added */ /* 5 Comments support added */ /* 6 Forceadd support added */ /* 7 skbinfo support added */ #define IPSET_TYPE_REV_MAX 8 /* bucketsize, initval support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:net,port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:net,port"); /* Type specific function prefix */ #define HTYPE hash_netport #define IP_SET_HASH_WITH_PROTO #define IP_SET_HASH_WITH_NETS /* We squeeze the "nomatch" flag into cidr: we don't support cidr == 0 * However this way we have to store internally cidr - 1, * dancing back and forth. */ #define IP_SET_HASH_WITH_NETS_PACKED /* IPv4 variant */ /* Member elements */ struct hash_netport4_elem { __be32 ip; __be16 port; u8 proto; u8 cidr:7; u8 nomatch:1; }; /* Common functions */ static bool hash_netport4_data_equal(const struct hash_netport4_elem *ip1, const struct hash_netport4_elem *ip2, u32 *multi) { return ip1->ip == ip2->ip && ip1->port == ip2->port && ip1->proto == ip2->proto && ip1->cidr == ip2->cidr; } static int hash_netport4_do_data_match(const struct hash_netport4_elem *elem) { return elem->nomatch ? 
-ENOTEMPTY : 1; } static void hash_netport4_data_set_flags(struct hash_netport4_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } static void hash_netport4_data_reset_flags(struct hash_netport4_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } static void hash_netport4_data_netmask(struct hash_netport4_elem *elem, u8 cidr) { elem->ip &= ip_set_netmask(cidr); elem->cidr = cidr - 1; } static bool hash_netport4_data_list(struct sk_buff *skb, const struct hash_netport4_elem *data) { u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr + 1) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_netport4_data_next(struct hash_netport4_elem *next, const struct hash_netport4_elem *d) { next->ip = d->ip; next->port = d->port; } #define MTYPE hash_netport4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" static int hash_netport4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_netport4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netport4_elem e = { .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (adt == IPSET_TEST) e.cidr = HOST_MASK - 1; if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); e.ip &= ip_set_netmask(e.cidr + 1); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { struct hash_netport4 *h = set->data; 
ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netport4_elem e = { .cidr = HOST_MASK - 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 port, port_to, p = 0, ip = 0, ip_to = 0, i = 0; bool with_ports = false; u8 cidr; int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (tb[IPSET_ATTR_CIDR]) { cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; e.cidr = cidr - 1; } e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); with_ports = ip_set_proto_with_ports(e.proto); if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else { return -IPSET_ERR_MISSING_PROTO; } if (!(with_ports || e.proto == IPPROTO_ICMP)) e.port = 0; with_ports = with_ports && tb[IPSET_ATTR_PORT_TO]; if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } if (adt == IPSET_TEST || !(with_ports || tb[IPSET_ATTR_IP_TO])) { e.ip = htonl(ip & ip_set_hostmask(e.cidr + 1)); ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_enomatch(ret, flags, adt, set) ? -ret : ip_set_eexist(ret, flags) ? 
0 : ret; } port = port_to = ntohs(e.port); if (tb[IPSET_ATTR_PORT_TO]) { port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port_to < port) swap(port, port_to); } if (tb[IPSET_ATTR_IP_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) return ret; if (ip_to < ip) swap(ip, ip_to); if (ip + UINT_MAX == ip_to) return -IPSET_ERR_HASH_RANGE; } else { ip_set_mask_from_to(ip, ip_to, e.cidr + 1); } if (retried) { ip = ntohl(h->next.ip); p = ntohs(h->next.port); } else { p = port; } do { e.ip = htonl(ip); ip = ip_set_range_to_cidr(ip, ip_to, &cidr); e.cidr = cidr - 1; for (; p <= port_to; p++, i++) { e.port = htons(p); if (i > IPSET_MAX_RANGE) { hash_netport4_data_next(&h->next, &e); return -ERANGE; } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; } p = port; } while (ip++ < ip_to); return ret; } /* IPv6 variant */ struct hash_netport6_elem { union nf_inet_addr ip; __be16 port; u8 proto; u8 cidr:7; u8 nomatch:1; }; /* Common functions */ static bool hash_netport6_data_equal(const struct hash_netport6_elem *ip1, const struct hash_netport6_elem *ip2, u32 *multi) { return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && ip1->port == ip2->port && ip1->proto == ip2->proto && ip1->cidr == ip2->cidr; } static int hash_netport6_do_data_match(const struct hash_netport6_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } static void hash_netport6_data_set_flags(struct hash_netport6_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } static void hash_netport6_data_reset_flags(struct hash_netport6_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } static void hash_netport6_data_netmask(struct hash_netport6_elem *elem, u8 cidr) { ip6_netmask(&elem->ip, cidr); elem->cidr = cidr - 1; } static bool hash_netport6_data_list(struct sk_buff *skb, const struct hash_netport6_elem *data) { u32 flags = data->nomatch ? 
IPSET_FLAG_NOMATCH : 0; if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr + 1) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_netport6_data_next(struct hash_netport6_elem *next, const struct hash_netport6_elem *d) { next->port = d->port; } #undef MTYPE #undef HOST_MASK #define MTYPE hash_netport6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" static int hash_netport6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_netport6 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netport6_elem e = { .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (adt == IPSET_TEST) e.cidr = HOST_MASK - 1; if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); ip6_netmask(&e.ip, e.cidr + 1); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { const struct hash_netport6 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netport6_elem e = { .cidr = HOST_MASK - 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 port, port_to; bool with_ports = false; u8 cidr; int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if 
(unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (tb[IPSET_ATTR_CIDR]) { cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; e.cidr = cidr - 1; } ip6_netmask(&e.ip, e.cidr + 1); e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); with_ports = ip_set_proto_with_ports(e.proto); if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else { return -IPSET_ERR_MISSING_PROTO; } if (!(with_ports || e.proto == IPPROTO_ICMPV6)) e.port = 0; if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_enomatch(ret, flags, adt, set) ? -ret : ip_set_eexist(ret, flags) ? 
0 : ret; } port = ntohs(e.port); port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); if (retried) port = ntohs(h->next.port); for (; port <= port_to; port++) { e.port = htons(port); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; } return ret; } static struct ip_set_type hash_netport_type __read_mostly = { .name = "hash:net,port", .protocol = IPSET_PROTOCOL, .features = IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_NOMATCH, .dimension = IPSET_DIM_TWO, .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_netport_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_PORT] = { .type = NLA_U16 }, [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; static int __init hash_netport_init(void) { return ip_set_type_register(&hash_netport_type); } static void __exit hash_netport_fini(void) { 
rcu_barrier(); ip_set_type_unregister(&hash_netport_type); } module_init(hash_netport_init); module_exit(hash_netport_fini);
3 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 // SPDX-License-Identifier: GPL-2.0-only #include "netlink.h" #include "common.h" struct rss_req_info { struct ethnl_req_info base; u32 rss_context; }; struct rss_reply_data { struct ethnl_reply_data base; u32 indir_size; u32 hkey_size; u32 hfunc; u32 input_xfrm; u32 *indir_table; u8 *hkey; }; #define RSS_REQINFO(__req_base) \ container_of(__req_base, struct rss_req_info, base) #define RSS_REPDATA(__reply_base) \ container_of(__reply_base, struct rss_reply_data, base) const struct nla_policy ethnl_rss_get_policy[] = { [ETHTOOL_A_RSS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_RSS_CONTEXT] = { .type = NLA_U32 }, }; static int rss_parse_request(struct ethnl_req_info *req_info, struct nlattr **tb, struct netlink_ext_ack *extack) { struct rss_req_info *request = RSS_REQINFO(req_info); if (tb[ETHTOOL_A_RSS_CONTEXT]) request->rss_context = nla_get_u32(tb[ETHTOOL_A_RSS_CONTEXT]); return 0; } static int rss_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct rss_reply_data *data = RSS_REPDATA(reply_base); struct rss_req_info *request = RSS_REQINFO(req_base); struct net_device *dev = reply_base->dev; struct ethtool_rxfh_param rxfh = {}; const struct ethtool_ops *ops; u32 total_size, indir_bytes; u8 *rss_config; int ret; ops = dev->ethtool_ops; if (!ops->get_rxfh) return -EOPNOTSUPP; /* Some drivers don't handle rss_context */ if 
(request->rss_context && !ops->cap_rss_ctx_supported) return -EOPNOTSUPP; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; data->indir_size = 0; data->hkey_size = 0; if (ops->get_rxfh_indir_size) data->indir_size = ops->get_rxfh_indir_size(dev); if (ops->get_rxfh_key_size) data->hkey_size = ops->get_rxfh_key_size(dev); indir_bytes = data->indir_size * sizeof(u32); total_size = indir_bytes + data->hkey_size; rss_config = kzalloc(total_size, GFP_KERNEL); if (!rss_config) { ret = -ENOMEM; goto out_ops; } if (data->indir_size) data->indir_table = (u32 *)rss_config; if (data->hkey_size) data->hkey = rss_config + indir_bytes; rxfh.indir_size = data->indir_size; rxfh.indir = data->indir_table; rxfh.key_size = data->hkey_size; rxfh.key = data->hkey; rxfh.rss_context = request->rss_context; ret = ops->get_rxfh(dev, &rxfh); if (ret) goto out_ops; data->hfunc = rxfh.hfunc; data->input_xfrm = rxfh.input_xfrm; out_ops: ethnl_ops_complete(dev); return ret; } static int rss_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct rss_reply_data *data = RSS_REPDATA(reply_base); int len; len = nla_total_size(sizeof(u32)) + /* _RSS_HFUNC */ nla_total_size(sizeof(u32)) + /* _RSS_INPUT_XFRM */ nla_total_size(sizeof(u32) * data->indir_size) + /* _RSS_INDIR */ nla_total_size(data->hkey_size); /* _RSS_HKEY */ return len; } static int rss_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct rss_reply_data *data = RSS_REPDATA(reply_base); if ((data->hfunc && nla_put_u32(skb, ETHTOOL_A_RSS_HFUNC, data->hfunc)) || (data->input_xfrm && nla_put_u32(skb, ETHTOOL_A_RSS_INPUT_XFRM, data->input_xfrm)) || (data->indir_size && nla_put(skb, ETHTOOL_A_RSS_INDIR, sizeof(u32) * data->indir_size, data->indir_table)) || (data->hkey_size && nla_put(skb, ETHTOOL_A_RSS_HKEY, data->hkey_size, data->hkey))) return -EMSGSIZE; return 0; } static void rss_cleanup_data(struct 
ethnl_reply_data *reply_base) { const struct rss_reply_data *data = RSS_REPDATA(reply_base); kfree(data->indir_table); } const struct ethnl_request_ops ethnl_rss_request_ops = { .request_cmd = ETHTOOL_MSG_RSS_GET, .reply_cmd = ETHTOOL_MSG_RSS_GET_REPLY, .hdr_attr = ETHTOOL_A_RSS_HEADER, .req_info_size = sizeof(struct rss_req_info), .reply_data_size = sizeof(struct rss_reply_data), .parse_request = rss_parse_request, .prepare_data = rss_prepare_data, .reply_size = rss_reply_size, .fill_reply = rss_fill_reply, .cleanup_data = rss_cleanup_data, };
1079 1095 16 43 35 28 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 
522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 // SPDX-License-Identifier: GPL-2.0-only /* net/atm/clip.c - RFC1577 Classical IP over ATM */ /* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__ #include <linux/string.h> #include <linux/errno.h> #include <linux/kernel.h> /* for UINT_MAX */ #include <linux/module.h> #include <linux/init.h> #include 
<linux/netdevice.h> #include <linux/skbuff.h> #include <linux/wait.h> #include <linux/timer.h> #include <linux/if_arp.h> /* for some manifest constants */ #include <linux/notifier.h> #include <linux/atm.h> #include <linux/atmdev.h> #include <linux/atmclip.h> #include <linux/atmarp.h> #include <linux/capability.h> #include <linux/ip.h> /* for net/route.h */ #include <linux/in.h> /* for struct sockaddr_in */ #include <linux/if.h> /* for IFF_UP */ #include <linux/inetdevice.h> #include <linux/bitops.h> #include <linux/poison.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/rcupdate.h> #include <linux/jhash.h> #include <linux/slab.h> #include <net/route.h> /* for struct rtable and routing */ #include <net/icmp.h> /* icmp_send */ #include <net/arp.h> #include <linux/param.h> /* for HZ */ #include <linux/uaccess.h> #include <asm/byteorder.h> /* for htons etc. */ #include <linux/atomic.h> #include "common.h" #include "resources.h" #include <net/atmclip.h> static struct net_device *clip_devs; static struct atm_vcc *atmarpd; static struct timer_list idle_timer; static const struct neigh_ops clip_neigh_ops; static int to_atmarpd(enum atmarp_ctrl_type type, int itf, __be32 ip) { struct sock *sk; struct atmarp_ctrl *ctrl; struct sk_buff *skb; pr_debug("(%d)\n", type); if (!atmarpd) return -EUNATCH; skb = alloc_skb(sizeof(struct atmarp_ctrl), GFP_ATOMIC); if (!skb) return -ENOMEM; ctrl = skb_put(skb, sizeof(struct atmarp_ctrl)); ctrl->type = type; ctrl->itf_num = itf; ctrl->ip = ip; atm_force_charge(atmarpd, skb->truesize); sk = sk_atm(atmarpd); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk); return 0; } static void link_vcc(struct clip_vcc *clip_vcc, struct atmarp_entry *entry) { pr_debug("%p to entry %p (neigh %p)\n", clip_vcc, entry, entry->neigh); clip_vcc->entry = entry; clip_vcc->xoff = 0; /* @@@ may overrun buffer by one packet */ clip_vcc->next = entry->vccs; entry->vccs = clip_vcc; entry->neigh->used = jiffies; } static void 
unlink_clip_vcc(struct clip_vcc *clip_vcc) { struct atmarp_entry *entry = clip_vcc->entry; struct clip_vcc **walk; if (!entry) { pr_err("!clip_vcc->entry (clip_vcc %p)\n", clip_vcc); return; } netif_tx_lock_bh(entry->neigh->dev); /* block clip_start_xmit() */ entry->neigh->used = jiffies; for (walk = &entry->vccs; *walk; walk = &(*walk)->next) if (*walk == clip_vcc) { int error; *walk = clip_vcc->next; /* atomic */ clip_vcc->entry = NULL; if (clip_vcc->xoff) netif_wake_queue(entry->neigh->dev); if (entry->vccs) goto out; entry->expires = jiffies - 1; /* force resolution or expiration */ error = neigh_update(entry->neigh, NULL, NUD_NONE, NEIGH_UPDATE_F_ADMIN, 0); if (error) pr_err("neigh_update failed with %d\n", error); goto out; } pr_err("ATMARP: failed (entry %p, vcc 0x%p)\n", entry, clip_vcc); out: netif_tx_unlock_bh(entry->neigh->dev); } /* The neighbour entry n->lock is held. */ static int neigh_check_cb(struct neighbour *n) { struct atmarp_entry *entry = neighbour_priv(n); struct clip_vcc *cv; if (n->ops != &clip_neigh_ops) return 0; for (cv = entry->vccs; cv; cv = cv->next) { unsigned long exp = cv->last_use + cv->idle_timeout; if (cv->idle_timeout && time_after(jiffies, exp)) { pr_debug("releasing vcc %p->%p of entry %p\n", cv, cv->vcc, entry); vcc_release_async(cv->vcc, -ETIMEDOUT); } } if (entry->vccs || time_before(jiffies, entry->expires)) return 0; if (refcount_read(&n->refcnt) > 1) { struct sk_buff *skb; pr_debug("destruction postponed with ref %d\n", refcount_read(&n->refcnt)); while ((skb = skb_dequeue(&n->arp_queue)) != NULL) dev_kfree_skb(skb); return 0; } pr_debug("expired neigh %p\n", n); return 1; } static void idle_timer_check(struct timer_list *unused) { write_lock(&arp_tbl.lock); __neigh_for_each_release(&arp_tbl, neigh_check_cb); mod_timer(&idle_timer, jiffies + CLIP_CHECK_INTERVAL * HZ); write_unlock(&arp_tbl.lock); } static int clip_arp_rcv(struct sk_buff *skb) { struct atm_vcc *vcc; pr_debug("\n"); vcc = ATM_SKB(skb)->vcc; if (!vcc || 
!atm_charge(vcc, skb->truesize)) { dev_kfree_skb_any(skb); return 0; } pr_debug("pushing to %p\n", vcc); pr_debug("using %p\n", CLIP_VCC(vcc)->old_push); CLIP_VCC(vcc)->old_push(vcc, skb); return 0; } static const unsigned char llc_oui[] = { 0xaa, /* DSAP: non-ISO */ 0xaa, /* SSAP: non-ISO */ 0x03, /* Ctrl: Unnumbered Information Command PDU */ 0x00, /* OUI: EtherType */ 0x00, 0x00 }; static void clip_push(struct atm_vcc *vcc, struct sk_buff *skb) { struct clip_vcc *clip_vcc = CLIP_VCC(vcc); pr_debug("\n"); if (!clip_devs) { atm_return(vcc, skb->truesize); kfree_skb(skb); return; } if (!skb) { pr_debug("removing VCC %p\n", clip_vcc); if (clip_vcc->entry) unlink_clip_vcc(clip_vcc); clip_vcc->old_push(vcc, NULL); /* pass on the bad news */ kfree(clip_vcc); return; } atm_return(vcc, skb->truesize); skb->dev = clip_vcc->entry ? clip_vcc->entry->neigh->dev : clip_devs; /* clip_vcc->entry == NULL if we don't have an IP address yet */ if (!skb->dev) { dev_kfree_skb_any(skb); return; } ATM_SKB(skb)->vcc = vcc; skb_reset_mac_header(skb); if (!clip_vcc->encap || skb->len < RFC1483LLC_LEN || memcmp(skb->data, llc_oui, sizeof(llc_oui))) skb->protocol = htons(ETH_P_IP); else { skb->protocol = ((__be16 *)skb->data)[3]; skb_pull(skb, RFC1483LLC_LEN); if (skb->protocol == htons(ETH_P_ARP)) { skb->dev->stats.rx_packets++; skb->dev->stats.rx_bytes += skb->len; clip_arp_rcv(skb); return; } } clip_vcc->last_use = jiffies; skb->dev->stats.rx_packets++; skb->dev->stats.rx_bytes += skb->len; memset(ATM_SKB(skb), 0, sizeof(struct atm_skb_data)); netif_rx(skb); } /* * Note: these spinlocks _must_not_ block on non-SMP. The only goal is that * clip_pop is atomic with respect to the critical section in clip_start_xmit. 
*/ static void clip_pop(struct atm_vcc *vcc, struct sk_buff *skb) { struct clip_vcc *clip_vcc = CLIP_VCC(vcc); struct net_device *dev = skb->dev; int old; unsigned long flags; pr_debug("(vcc %p)\n", vcc); clip_vcc->old_pop(vcc, skb); /* skb->dev == NULL in outbound ARP packets */ if (!dev) return; spin_lock_irqsave(&PRIV(dev)->xoff_lock, flags); if (atm_may_send(vcc, 0)) { old = xchg(&clip_vcc->xoff, 0); if (old) netif_wake_queue(dev); } spin_unlock_irqrestore(&PRIV(dev)->xoff_lock, flags); } static void clip_neigh_solicit(struct neighbour *neigh, struct sk_buff *skb) { __be32 *ip = (__be32 *) neigh->primary_key; pr_debug("(neigh %p, skb %p)\n", neigh, skb); to_atmarpd(act_need, PRIV(neigh->dev)->number, *ip); } static void clip_neigh_error(struct neighbour *neigh, struct sk_buff *skb) { #ifndef CONFIG_ATM_CLIP_NO_ICMP icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); #endif kfree_skb(skb); } static const struct neigh_ops clip_neigh_ops = { .family = AF_INET, .solicit = clip_neigh_solicit, .error_report = clip_neigh_error, .output = neigh_direct_output, .connected_output = neigh_direct_output, }; static int clip_constructor(struct net_device *dev, struct neighbour *neigh) { struct atmarp_entry *entry = neighbour_priv(neigh); if (neigh->tbl->family != AF_INET) return -EINVAL; if (neigh->type != RTN_UNICAST) return -EINVAL; neigh->nud_state = NUD_NONE; neigh->ops = &clip_neigh_ops; neigh->output = neigh->ops->output; entry->neigh = neigh; entry->vccs = NULL; entry->expires = jiffies - 1; return 0; } /* @@@ copy bh locking from arp.c -- need to bh-enable atm code before */ /* * We play with the resolve flag: 0 and 1 have the usual meaning, but -1 means * to allocate the neighbour entry but not to ask atmarpd for resolution. Also, * don't increment the usage count. This is used to create entries in * clip_setentry. 
*/ static int clip_encap(struct atm_vcc *vcc, int mode) { if (!CLIP_VCC(vcc)) return -EBADFD; CLIP_VCC(vcc)->encap = mode; return 0; } static netdev_tx_t clip_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct clip_priv *clip_priv = PRIV(dev); struct dst_entry *dst = skb_dst(skb); struct atmarp_entry *entry; struct neighbour *n; struct atm_vcc *vcc; struct rtable *rt; __be32 *daddr; int old; unsigned long flags; pr_debug("(skb %p)\n", skb); if (!dst) { pr_err("skb_dst(skb) == NULL\n"); dev_kfree_skb(skb); dev->stats.tx_dropped++; return NETDEV_TX_OK; } rt = (struct rtable *) dst; if (rt->rt_gw_family == AF_INET) daddr = &rt->rt_gw4; else daddr = &ip_hdr(skb)->daddr; n = dst_neigh_lookup(dst, daddr); if (!n) { pr_err("NO NEIGHBOUR !\n"); dev_kfree_skb(skb); dev->stats.tx_dropped++; return NETDEV_TX_OK; } entry = neighbour_priv(n); if (!entry->vccs) { if (time_after(jiffies, entry->expires)) { /* should be resolved */ entry->expires = jiffies + ATMARP_RETRY_DELAY * HZ; to_atmarpd(act_need, PRIV(dev)->number, *((__be32 *)n->primary_key)); } if (entry->neigh->arp_queue.qlen < ATMARP_MAX_UNRES_PACKETS) skb_queue_tail(&entry->neigh->arp_queue, skb); else { dev_kfree_skb(skb); dev->stats.tx_dropped++; } goto out_release_neigh; } pr_debug("neigh %p, vccs %p\n", entry, entry->vccs); ATM_SKB(skb)->vcc = vcc = entry->vccs->vcc; pr_debug("using neighbour %p, vcc %p\n", n, vcc); if (entry->vccs->encap) { void *here; here = skb_push(skb, RFC1483LLC_LEN); memcpy(here, llc_oui, sizeof(llc_oui)); ((__be16 *) here)[3] = skb->protocol; } atm_account_tx(vcc, skb); entry->vccs->last_use = jiffies; pr_debug("atm_skb(%p)->vcc(%p)->dev(%p)\n", skb, vcc, vcc->dev); old = xchg(&entry->vccs->xoff, 1); /* assume XOFF ... 
*/ if (old) { pr_warn("XOFF->XOFF transition\n"); goto out_release_neigh; } dev->stats.tx_packets++; dev->stats.tx_bytes += skb->len; vcc->send(vcc, skb); if (atm_may_send(vcc, 0)) { entry->vccs->xoff = 0; goto out_release_neigh; } spin_lock_irqsave(&clip_priv->xoff_lock, flags); netif_stop_queue(dev); /* XOFF -> throttle immediately */ barrier(); if (!entry->vccs->xoff) netif_start_queue(dev); /* Oh, we just raced with clip_pop. netif_start_queue should be good enough, because nothing should really be asleep because of the brief netif_stop_queue. If this isn't true or if it changes, use netif_wake_queue instead. */ spin_unlock_irqrestore(&clip_priv->xoff_lock, flags); out_release_neigh: neigh_release(n); return NETDEV_TX_OK; } static int clip_mkip(struct atm_vcc *vcc, int timeout) { struct clip_vcc *clip_vcc; if (!vcc->push) return -EBADFD; clip_vcc = kmalloc(sizeof(struct clip_vcc), GFP_KERNEL); if (!clip_vcc) return -ENOMEM; pr_debug("%p vcc %p\n", clip_vcc, vcc); clip_vcc->vcc = vcc; vcc->user_back = clip_vcc; set_bit(ATM_VF_IS_CLIP, &vcc->flags); clip_vcc->entry = NULL; clip_vcc->xoff = 0; clip_vcc->encap = 1; clip_vcc->last_use = jiffies; clip_vcc->idle_timeout = timeout * HZ; clip_vcc->old_push = vcc->push; clip_vcc->old_pop = vcc->pop; vcc->push = clip_push; vcc->pop = clip_pop; /* re-process everything received between connection setup and MKIP */ vcc_process_recv_queue(vcc); return 0; } static int clip_setentry(struct atm_vcc *vcc, __be32 ip) { struct neighbour *neigh; struct atmarp_entry *entry; int error; struct clip_vcc *clip_vcc; struct rtable *rt; if (vcc->push != clip_push) { pr_warn("non-CLIP VCC\n"); return -EBADF; } clip_vcc = CLIP_VCC(vcc); if (!ip) { if (!clip_vcc->entry) { pr_err("hiding hidden ATMARP entry\n"); return 0; } pr_debug("remove\n"); unlink_clip_vcc(clip_vcc); return 0; } rt = ip_route_output(&init_net, ip, 0, 1, 0); if (IS_ERR(rt)) return PTR_ERR(rt); neigh = __neigh_lookup(&arp_tbl, &ip, rt->dst.dev, 1); ip_rt_put(rt); if 
(!neigh) return -ENOMEM; entry = neighbour_priv(neigh); if (entry != clip_vcc->entry) { if (!clip_vcc->entry) pr_debug("add\n"); else { pr_debug("update\n"); unlink_clip_vcc(clip_vcc); } link_vcc(clip_vcc, entry); } error = neigh_update(neigh, llc_oui, NUD_PERMANENT, NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN, 0); neigh_release(neigh); return error; } static const struct net_device_ops clip_netdev_ops = { .ndo_start_xmit = clip_start_xmit, .ndo_neigh_construct = clip_constructor, }; static void clip_setup(struct net_device *dev) { dev->netdev_ops = &clip_netdev_ops; dev->type = ARPHRD_ATM; dev->neigh_priv_len = sizeof(struct atmarp_entry); dev->hard_header_len = RFC1483LLC_LEN; dev->mtu = RFC1626_MTU; dev->tx_queue_len = 100; /* "normal" queue (packets) */ /* When using a "real" qdisc, the qdisc determines the queue */ /* length. tx_queue_len is only used for the default case, */ /* without any more elaborate queuing. 100 is a reasonable */ /* compromise between decent burst-tolerance and protection */ /* against memory hogs. 
*/ netif_keep_dst(dev); } static int clip_create(int number) { struct net_device *dev; struct clip_priv *clip_priv; int error; if (number != -1) { for (dev = clip_devs; dev; dev = PRIV(dev)->next) if (PRIV(dev)->number == number) return -EEXIST; } else { number = 0; for (dev = clip_devs; dev; dev = PRIV(dev)->next) if (PRIV(dev)->number >= number) number = PRIV(dev)->number + 1; } dev = alloc_netdev(sizeof(struct clip_priv), "", NET_NAME_UNKNOWN, clip_setup); if (!dev) return -ENOMEM; clip_priv = PRIV(dev); sprintf(dev->name, "atm%d", number); spin_lock_init(&clip_priv->xoff_lock); clip_priv->number = number; error = register_netdev(dev); if (error) { free_netdev(dev); return error; } clip_priv->next = clip_devs; clip_devs = dev; pr_debug("registered (net:%s)\n", dev->name); return number; } static int clip_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; if (event == NETDEV_UNREGISTER) return NOTIFY_DONE; /* ignore non-CLIP devices */ if (dev->type != ARPHRD_ATM || dev->netdev_ops != &clip_netdev_ops) return NOTIFY_DONE; switch (event) { case NETDEV_UP: pr_debug("NETDEV_UP\n"); to_atmarpd(act_up, PRIV(dev)->number, 0); break; case NETDEV_GOING_DOWN: pr_debug("NETDEV_DOWN\n"); to_atmarpd(act_down, PRIV(dev)->number, 0); break; case NETDEV_CHANGE: case NETDEV_CHANGEMTU: pr_debug("NETDEV_CHANGE*\n"); to_atmarpd(act_change, PRIV(dev)->number, 0); break; } return NOTIFY_DONE; } static int clip_inet_event(struct notifier_block *this, unsigned long event, void *ifa) { struct in_device *in_dev; struct netdev_notifier_info info; in_dev = ((struct in_ifaddr *)ifa)->ifa_dev; /* * Transitions are of the down-change-up type, so it's sufficient to * handle the change on up. 
*/ if (event != NETDEV_UP) return NOTIFY_DONE; netdev_notifier_info_init(&info, in_dev->dev); return clip_device_event(this, NETDEV_CHANGE, &info); } static struct notifier_block clip_dev_notifier = { .notifier_call = clip_device_event, }; static struct notifier_block clip_inet_notifier = { .notifier_call = clip_inet_event, }; static void atmarpd_close(struct atm_vcc *vcc) { pr_debug("\n"); rtnl_lock(); atmarpd = NULL; skb_queue_purge(&sk_atm(vcc)->sk_receive_queue); rtnl_unlock(); pr_debug("(done)\n"); module_put(THIS_MODULE); } static const struct atmdev_ops atmarpd_dev_ops = { .close = atmarpd_close }; static struct atm_dev atmarpd_dev = { .ops = &atmarpd_dev_ops, .type = "arpd", .number = 999, .lock = __SPIN_LOCK_UNLOCKED(atmarpd_dev.lock) }; static int atm_init_atmarp(struct atm_vcc *vcc) { rtnl_lock(); if (atmarpd) { rtnl_unlock(); return -EADDRINUSE; } mod_timer(&idle_timer, jiffies + CLIP_CHECK_INTERVAL * HZ); atmarpd = vcc; set_bit(ATM_VF_META, &vcc->flags); set_bit(ATM_VF_READY, &vcc->flags); /* allow replies and avoid getting closed if signaling dies */ vcc->dev = &atmarpd_dev; vcc_insert_socket(sk_atm(vcc)); vcc->push = NULL; vcc->pop = NULL; /* crash */ vcc->push_oam = NULL; /* crash */ rtnl_unlock(); return 0; } static int clip_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct atm_vcc *vcc = ATM_SD(sock); int err = 0; switch (cmd) { case SIOCMKCLIP: case ATMARPD_CTRL: case ATMARP_MKIP: case ATMARP_SETENTRY: case ATMARP_ENCAP: if (!capable(CAP_NET_ADMIN)) return -EPERM; break; default: return -ENOIOCTLCMD; } switch (cmd) { case SIOCMKCLIP: err = clip_create(arg); break; case ATMARPD_CTRL: err = atm_init_atmarp(vcc); if (!err) { sock->state = SS_CONNECTED; __module_get(THIS_MODULE); } break; case ATMARP_MKIP: err = clip_mkip(vcc, arg); break; case ATMARP_SETENTRY: err = clip_setentry(vcc, (__force __be32)arg); break; case ATMARP_ENCAP: err = clip_encap(vcc, arg); break; } return err; } static struct atm_ioctl clip_ioctl_ops = { 
.owner = THIS_MODULE, .ioctl = clip_ioctl, }; #ifdef CONFIG_PROC_FS static void svc_addr(struct seq_file *seq, struct sockaddr_atmsvc *addr) { static int code[] = { 1, 2, 10, 6, 1, 0 }; static int e164[] = { 1, 8, 4, 6, 1, 0 }; if (*addr->sas_addr.pub) { seq_printf(seq, "%s", addr->sas_addr.pub); if (*addr->sas_addr.prv) seq_putc(seq, '+'); } else if (!*addr->sas_addr.prv) { seq_printf(seq, "%s", "(none)"); return; } if (*addr->sas_addr.prv) { unsigned char *prv = addr->sas_addr.prv; int *fields; int i, j; fields = *prv == ATM_AFI_E164 ? e164 : code; for (i = 0; fields[i]; i++) { for (j = fields[i]; j; j--) seq_printf(seq, "%02X", *prv++); if (fields[i + 1]) seq_putc(seq, '.'); } } } /* This means the neighbour entry has no attached VCC objects. */ #define SEQ_NO_VCC_TOKEN ((void *) 2) static void atmarp_info(struct seq_file *seq, struct neighbour *n, struct atmarp_entry *entry, struct clip_vcc *clip_vcc) { struct net_device *dev = n->dev; unsigned long exp; char buf[17]; int svc, llc, off; svc = ((clip_vcc == SEQ_NO_VCC_TOKEN) || (sk_atm(clip_vcc->vcc)->sk_family == AF_ATMSVC)); llc = ((clip_vcc == SEQ_NO_VCC_TOKEN) || clip_vcc->encap); if (clip_vcc == SEQ_NO_VCC_TOKEN) exp = entry->neigh->used; else exp = clip_vcc->last_use; exp = (jiffies - exp) / HZ; seq_printf(seq, "%-6s%-4s%-4s%5ld ", dev->name, svc ? "SVC" : "PVC", llc ? "LLC" : "NULL", exp); off = scnprintf(buf, sizeof(buf) - 1, "%pI4", n->primary_key); while (off < 16) buf[off++] = ' '; buf[off] = '\0'; seq_printf(seq, "%s", buf); if (clip_vcc == SEQ_NO_VCC_TOKEN) { if (time_before(jiffies, entry->expires)) seq_printf(seq, "(resolving)\n"); else seq_printf(seq, "(expired, ref %d)\n", refcount_read(&entry->neigh->refcnt)); } else if (!svc) { seq_printf(seq, "%d.%d.%d\n", clip_vcc->vcc->dev->number, clip_vcc->vcc->vpi, clip_vcc->vcc->vci); } else { svc_addr(seq, &clip_vcc->vcc->remote); seq_putc(seq, '\n'); } } struct clip_seq_state { /* This member must be first. 
*/ struct neigh_seq_state ns; /* Local to clip specific iteration. */ struct clip_vcc *vcc; }; static struct clip_vcc *clip_seq_next_vcc(struct atmarp_entry *e, struct clip_vcc *curr) { if (!curr) { curr = e->vccs; if (!curr) return SEQ_NO_VCC_TOKEN; return curr; } if (curr == SEQ_NO_VCC_TOKEN) return NULL; curr = curr->next; return curr; } static void *clip_seq_vcc_walk(struct clip_seq_state *state, struct atmarp_entry *e, loff_t * pos) { struct clip_vcc *vcc = state->vcc; vcc = clip_seq_next_vcc(e, vcc); if (vcc && pos != NULL) { while (*pos) { vcc = clip_seq_next_vcc(e, vcc); if (!vcc) break; --(*pos); } } state->vcc = vcc; return vcc; } static void *clip_seq_sub_iter(struct neigh_seq_state *_state, struct neighbour *n, loff_t * pos) { struct clip_seq_state *state = (struct clip_seq_state *)_state; if (n->dev->type != ARPHRD_ATM) return NULL; return clip_seq_vcc_walk(state, neighbour_priv(n), pos); } static void *clip_seq_start(struct seq_file *seq, loff_t * pos) { struct clip_seq_state *state = seq->private; state->ns.neigh_sub_iter = clip_seq_sub_iter; return neigh_seq_start(seq, pos, &arp_tbl, NEIGH_SEQ_NEIGH_ONLY); } static int clip_seq_show(struct seq_file *seq, void *v) { static char atm_arp_banner[] = "IPitf TypeEncp Idle IP address ATM address\n"; if (v == SEQ_START_TOKEN) { seq_puts(seq, atm_arp_banner); } else { struct clip_seq_state *state = seq->private; struct clip_vcc *vcc = state->vcc; struct neighbour *n = v; atmarp_info(seq, n, neighbour_priv(n), vcc); } return 0; } static const struct seq_operations arp_seq_ops = { .start = clip_seq_start, .next = neigh_seq_next, .stop = neigh_seq_stop, .show = clip_seq_show, }; #endif static void atm_clip_exit_noproc(void); static int __init atm_clip_init(void) { register_atm_ioctl(&clip_ioctl_ops); register_netdevice_notifier(&clip_dev_notifier); register_inetaddr_notifier(&clip_inet_notifier); timer_setup(&idle_timer, idle_timer_check, 0); #ifdef CONFIG_PROC_FS { struct proc_dir_entry *p; p = 
proc_create_net("arp", 0444, atm_proc_root, &arp_seq_ops, sizeof(struct clip_seq_state)); if (!p) { pr_err("Unable to initialize /proc/net/atm/arp\n"); atm_clip_exit_noproc(); return -ENOMEM; } } #endif return 0; } static void atm_clip_exit_noproc(void) { struct net_device *dev, *next; unregister_inetaddr_notifier(&clip_inet_notifier); unregister_netdevice_notifier(&clip_dev_notifier); deregister_atm_ioctl(&clip_ioctl_ops); /* First, stop the idle timer, so it stops banging * on the table. */ del_timer_sync(&idle_timer); dev = clip_devs; while (dev) { next = PRIV(dev)->next; unregister_netdev(dev); free_netdev(dev); dev = next; } } static void __exit atm_clip_exit(void) { remove_proc_entry("arp", atm_proc_root); atm_clip_exit_noproc(); } module_init(atm_clip_init); module_exit(atm_clip_exit); MODULE_AUTHOR("Werner Almesberger"); MODULE_DESCRIPTION("Classical/IP over ATM interface"); MODULE_LICENSE("GPL");
32942 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_UNWIND_H #define _ASM_X86_UNWIND_H #include <linux/sched.h> #include <linux/ftrace.h> #include <linux/rethook.h> #include <asm/ptrace.h> #include <asm/stacktrace.h> #define IRET_FRAME_OFFSET (offsetof(struct pt_regs, ip)) #define IRET_FRAME_SIZE (sizeof(struct pt_regs) - IRET_FRAME_OFFSET) struct unwind_state { struct stack_info stack_info; unsigned long stack_mask; struct task_struct *task; int graph_idx; #if defined(CONFIG_RETHOOK) struct llist_node *kr_cur; #endif bool error; #if defined(CONFIG_UNWINDER_ORC) bool signal, full_regs; unsigned long sp, bp, ip; struct pt_regs *regs, *prev_regs; #elif defined(CONFIG_UNWINDER_FRAME_POINTER) bool got_irq; unsigned long *bp, *orig_sp, ip; /* * If non-NULL: The current frame is incomplete and doesn't contain a * valid BP. When looking for the next frame, use this instead of the * non-existent saved BP. 
*/ unsigned long *next_bp; struct pt_regs *regs; #else unsigned long *sp; #endif }; void __unwind_start(struct unwind_state *state, struct task_struct *task, struct pt_regs *regs, unsigned long *first_frame); bool unwind_next_frame(struct unwind_state *state); unsigned long unwind_get_return_address(struct unwind_state *state); unsigned long *unwind_get_return_address_ptr(struct unwind_state *state); static inline bool unwind_done(struct unwind_state *state) { return state->stack_info.type == STACK_TYPE_UNKNOWN; } static inline bool unwind_error(struct unwind_state *state) { return state->error; } static inline void unwind_start(struct unwind_state *state, struct task_struct *task, struct pt_regs *regs, unsigned long *first_frame) { first_frame = first_frame ? : get_stack_pointer(task, regs); __unwind_start(state, task, regs, first_frame); } #if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER) /* * If 'partial' returns true, only the iret frame registers are valid. 
*/ static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state, bool *partial) { if (unwind_done(state)) return NULL; if (partial) { #ifdef CONFIG_UNWINDER_ORC *partial = !state->full_regs; #else *partial = false; #endif } return state->regs; } #else static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state, bool *partial) { return NULL; } #endif #ifdef CONFIG_UNWINDER_ORC void unwind_init(void); void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, void *orc, size_t orc_size); #else static inline void unwind_init(void) {} static inline void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, void *orc, size_t orc_size) {} #endif static inline unsigned long unwind_recover_rethook(struct unwind_state *state, unsigned long addr, unsigned long *addr_p) { #ifdef CONFIG_RETHOOK if (is_rethook_trampoline(addr)) return rethook_find_ret_addr(state->task, (unsigned long)addr_p, &state->kr_cur); #endif return addr; } /* Recover the return address modified by rethook and ftrace_graph. */ static inline unsigned long unwind_recover_ret_addr(struct unwind_state *state, unsigned long addr, unsigned long *addr_p) { unsigned long ret; ret = ftrace_graph_ret_addr(state->task, &state->graph_idx, addr, addr_p); return unwind_recover_rethook(state, ret, addr_p); } /* * This disables KASAN checking when reading a value from another task's stack, * since the other task could be running on another CPU and could have poisoned * the stack in the meantime. */ #define READ_ONCE_TASK_STACK(task, x) \ ({ \ unsigned long val; \ if (task == current) \ val = READ_ONCE(x); \ else \ val = READ_ONCE_NOCHECK(x); \ val; \ }) static inline bool task_on_another_cpu(struct task_struct *task) { #ifdef CONFIG_SMP return task != current && task->on_cpu; #else return false; #endif } #endif /* _ASM_X86_UNWIND_H */
14513 14500 14519 14499 14901 14509 14481 14512 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 
516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 // SPDX-License-Identifier: GPL-2.0 #include <linux/mm.h> #include <linux/mmzone.h> #include <linux/memblock.h> #include <linux/page_ext.h> #include <linux/memory.h> #include <linux/vmalloc.h> #include <linux/kmemleak.h> #include <linux/page_owner.h> #include <linux/page_idle.h> #include <linux/page_table_check.h> #include <linux/rcupdate.h> /* * struct page extension * * This is the feature to manage memory for extended data per page. * * Until now, we must modify struct page itself to store extra data per page. * This requires rebuilding the kernel and it is really time consuming process. * And, sometimes, rebuild is impossible due to third party module dependency. * At last, enlarging struct page could cause un-wanted system behaviour change. * * This feature is intended to overcome above mentioned problems. This feature * allocates memory for extended data per page in certain place rather than * the struct page itself. This memory can be accessed by the accessor * functions provided by this code. During the boot process, it checks whether * allocation of huge chunk of memory is needed or not. If not, it avoids * allocating memory at all. With this advantage, we can include this feature * into the kernel in default and can avoid rebuild and solve related problems. * * To help these things to work well, there are two callbacks for clients. One * is the need callback which is mandatory if user wants to avoid useless * memory allocation at boot-time. The other is optional, init callback, which * is used to do proper initialization after memory is allocated. * * The need callback is used to decide whether extended memory allocation is * needed or not. Sometimes users want to deactivate some features in this * boot and extra memory would be unnecessary. In this case, to avoid * allocating huge chunk of memory, each clients represent their need of * extra memory through the need callback. 
If one of the need callbacks * returns true, it means that someone needs extra memory so that * page extension core should allocates memory for page extension. If * none of need callbacks return true, memory isn't needed at all in this boot * and page extension core can skip to allocate memory. As result, * none of memory is wasted. * * When need callback returns true, page_ext checks if there is a request for * extra memory through size in struct page_ext_operations. If it is non-zero, * extra space is allocated for each page_ext entry and offset is returned to * user through offset in struct page_ext_operations. * * The init callback is used to do proper initialization after page extension * is completely initialized. In sparse memory system, extra memory is * allocated some time later than memmap is allocated. In other words, lifetime * of memory for page extension isn't same with memmap for struct page. * Therefore, clients can't store extra data until page extension is * initialized, even if pages are allocated and used freely. This could * cause inadequate state of extra data per page, so, to prevent it, client * can utilize this callback to initialize the state of it correctly. 
*/ #ifdef CONFIG_SPARSEMEM #define PAGE_EXT_INVALID (0x1) #endif #if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) static bool need_page_idle(void) { return true; } static struct page_ext_operations page_idle_ops __initdata = { .need = need_page_idle, .need_shared_flags = true, }; #endif static struct page_ext_operations *page_ext_ops[] __initdata = { #ifdef CONFIG_PAGE_OWNER &page_owner_ops, #endif #if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) &page_idle_ops, #endif #ifdef CONFIG_PAGE_TABLE_CHECK &page_table_check_ops, #endif }; unsigned long page_ext_size; static unsigned long total_usage; bool early_page_ext __meminitdata; static int __init setup_early_page_ext(char *str) { early_page_ext = true; return 0; } early_param("early_page_ext", setup_early_page_ext); static bool __init invoke_need_callbacks(void) { int i; int entries = ARRAY_SIZE(page_ext_ops); bool need = false; for (i = 0; i < entries; i++) { if (page_ext_ops[i]->need()) { if (page_ext_ops[i]->need_shared_flags) { page_ext_size = sizeof(struct page_ext); break; } } } for (i = 0; i < entries; i++) { if (page_ext_ops[i]->need()) { page_ext_ops[i]->offset = page_ext_size; page_ext_size += page_ext_ops[i]->size; need = true; } } return need; } static void __init invoke_init_callbacks(void) { int i; int entries = ARRAY_SIZE(page_ext_ops); for (i = 0; i < entries; i++) { if (page_ext_ops[i]->init) page_ext_ops[i]->init(); } } static inline struct page_ext *get_entry(void *base, unsigned long index) { return base + page_ext_size * index; } #ifndef CONFIG_SPARSEMEM void __init page_ext_init_flatmem_late(void) { invoke_init_callbacks(); } void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) { pgdat->node_page_ext = NULL; } static struct page_ext *lookup_page_ext(const struct page *page) { unsigned long pfn = page_to_pfn(page); unsigned long index; struct page_ext *base; WARN_ON_ONCE(!rcu_read_lock_held()); base = NODE_DATA(page_to_nid(page))->node_page_ext; /* * The sanity 
checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. */ if (unlikely(!base)) return NULL; index = pfn - round_down(node_start_pfn(page_to_nid(page)), MAX_ORDER_NR_PAGES); return get_entry(base, index); } static int __init alloc_node_page_ext(int nid) { struct page_ext *base; unsigned long table_size; unsigned long nr_pages; nr_pages = NODE_DATA(nid)->node_spanned_pages; if (!nr_pages) return 0; /* * Need extra space if node range is not aligned with * MAX_ORDER_NR_PAGES. When page allocator's buddy algorithm * checks buddy's status, range could be out of exact node range. */ if (!IS_ALIGNED(node_start_pfn(nid), MAX_ORDER_NR_PAGES) || !IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES)) nr_pages += MAX_ORDER_NR_PAGES; table_size = page_ext_size * nr_pages; base = memblock_alloc_try_nid( table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), MEMBLOCK_ALLOC_ACCESSIBLE, nid); if (!base) return -ENOMEM; NODE_DATA(nid)->node_page_ext = base; total_usage += table_size; return 0; } void __init page_ext_init_flatmem(void) { int nid, fail; if (!invoke_need_callbacks()) return; for_each_online_node(nid) { fail = alloc_node_page_ext(nid); if (fail) goto fail; } pr_info("allocated %ld bytes of page_ext\n", total_usage); return; fail: pr_crit("allocation of page_ext failed.\n"); panic("Out of memory"); } #else /* CONFIG_SPARSEMEM */ static bool page_ext_invalid(struct page_ext *page_ext) { return !page_ext || (((unsigned long)page_ext & PAGE_EXT_INVALID) == PAGE_EXT_INVALID); } static struct page_ext *lookup_page_ext(const struct page *page) { unsigned long pfn = page_to_pfn(page); struct mem_section *section = __pfn_to_section(pfn); struct page_ext *page_ext = READ_ONCE(section->page_ext); WARN_ON_ONCE(!rcu_read_lock_held()); /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext 
arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. */ if (page_ext_invalid(page_ext)) return NULL; return get_entry(page_ext, pfn); } static void *__meminit alloc_page_ext(size_t size, int nid) { gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN; void *addr = NULL; addr = alloc_pages_exact_nid(nid, size, flags); if (addr) { kmemleak_alloc(addr, size, 1, flags); return addr; } addr = vzalloc_node(size, nid); return addr; } static int __meminit init_section_page_ext(unsigned long pfn, int nid) { struct mem_section *section; struct page_ext *base; unsigned long table_size; section = __pfn_to_section(pfn); if (section->page_ext) return 0; table_size = page_ext_size * PAGES_PER_SECTION; base = alloc_page_ext(table_size, nid); /* * The value stored in section->page_ext is (base - pfn) * and it does not point to the memory block allocated above, * causing kmemleak false positives. */ kmemleak_not_leak(base); if (!base) { pr_err("page ext allocation failure\n"); return -ENOMEM; } /* * The passed "pfn" may not be aligned to SECTION. For the calculation * we need to apply a mask. */ pfn &= PAGE_SECTION_MASK; section->page_ext = (void *)base - page_ext_size * pfn; total_usage += table_size; return 0; } static void free_page_ext(void *addr) { if (is_vmalloc_addr(addr)) { vfree(addr); } else { struct page *page = virt_to_page(addr); size_t table_size; table_size = page_ext_size * PAGES_PER_SECTION; BUG_ON(PageReserved(page)); kmemleak_free(addr); free_pages_exact(addr, table_size); } } static void __free_page_ext(unsigned long pfn) { struct mem_section *ms; struct page_ext *base; ms = __pfn_to_section(pfn); if (!ms || !ms->page_ext) return; base = READ_ONCE(ms->page_ext); /* * page_ext here can be valid while doing the roll back * operation in online_page_ext(). 
*/ if (page_ext_invalid(base)) base = (void *)base - PAGE_EXT_INVALID; WRITE_ONCE(ms->page_ext, NULL); base = get_entry(base, pfn); free_page_ext(base); } static void __invalidate_page_ext(unsigned long pfn) { struct mem_section *ms; void *val; ms = __pfn_to_section(pfn); if (!ms || !ms->page_ext) return; val = (void *)ms->page_ext + PAGE_EXT_INVALID; WRITE_ONCE(ms->page_ext, val); } static int __meminit online_page_ext(unsigned long start_pfn, unsigned long nr_pages, int nid) { unsigned long start, end, pfn; int fail = 0; start = SECTION_ALIGN_DOWN(start_pfn); end = SECTION_ALIGN_UP(start_pfn + nr_pages); if (nid == NUMA_NO_NODE) { /* * In this case, "nid" already exists and contains valid memory. * "start_pfn" passed to us is a pfn which is an arg for * online__pages(), and start_pfn should exist. */ nid = pfn_to_nid(start_pfn); VM_BUG_ON(!node_online(nid)); } for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) fail = init_section_page_ext(pfn, nid); if (!fail) return 0; /* rollback */ end = pfn - PAGES_PER_SECTION; for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) __free_page_ext(pfn); return -ENOMEM; } static void __meminit offline_page_ext(unsigned long start_pfn, unsigned long nr_pages) { unsigned long start, end, pfn; start = SECTION_ALIGN_DOWN(start_pfn); end = SECTION_ALIGN_UP(start_pfn + nr_pages); /* * Freeing of page_ext is done in 3 steps to avoid * use-after-free of it: * 1) Traverse all the sections and mark their page_ext * as invalid. * 2) Wait for all the existing users of page_ext who * started before invalidation to finish. * 3) Free the page_ext. 
*/ for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) __invalidate_page_ext(pfn); synchronize_rcu(); for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) __free_page_ext(pfn); } static int __meminit page_ext_callback(struct notifier_block *self, unsigned long action, void *arg) { struct memory_notify *mn = arg; int ret = 0; switch (action) { case MEM_GOING_ONLINE: ret = online_page_ext(mn->start_pfn, mn->nr_pages, mn->status_change_nid); break; case MEM_OFFLINE: offline_page_ext(mn->start_pfn, mn->nr_pages); break; case MEM_CANCEL_ONLINE: offline_page_ext(mn->start_pfn, mn->nr_pages); break; case MEM_GOING_OFFLINE: break; case MEM_ONLINE: case MEM_CANCEL_OFFLINE: break; } return notifier_from_errno(ret); } void __init page_ext_init(void) { unsigned long pfn; int nid; if (!invoke_need_callbacks()) return; for_each_node_state(nid, N_MEMORY) { unsigned long start_pfn, end_pfn; start_pfn = node_start_pfn(nid); end_pfn = node_end_pfn(nid); /* * start_pfn and end_pfn may not be aligned to SECTION and the * page->flags of out of node pages are not initialized. So we * scan [start_pfn, the biggest section's pfn < end_pfn) here. */ for (pfn = start_pfn; pfn < end_pfn; pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) { if (!pfn_valid(pfn)) continue; /* * Nodes's pfns can be overlapping. * We know some arch can have a nodes layout such as * -------------pfn--------------> * N0 | N1 | N2 | N0 | N1 | N2|.... */ if (pfn_to_nid(pfn) != nid) continue; if (init_section_page_ext(pfn, nid)) goto oom; cond_resched(); } } hotplug_memory_notifier(page_ext_callback, DEFAULT_CALLBACK_PRI); pr_info("allocated %ld bytes of page_ext\n", total_usage); invoke_init_callbacks(); return; oom: panic("Out of memory"); } void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) { } #endif /** * page_ext_get() - Get the extended information for a page. * @page: The page we're interested in. * * Ensures that the page_ext will remain valid until page_ext_put() * is called. 
* * Return: NULL if no page_ext exists for this page. * Context: Any context. Caller may not sleep until they have called * page_ext_put(). */ struct page_ext *page_ext_get(struct page *page) { struct page_ext *page_ext; rcu_read_lock(); page_ext = lookup_page_ext(page); if (!page_ext) { rcu_read_unlock(); return NULL; } return page_ext; } /** * page_ext_put() - Working with page extended information is done. * @page_ext: Page extended information received from page_ext_get(). * * The page extended information of the page may not be valid after this * function is called. * * Return: None. * Context: Any context with corresponding page_ext_get() is called. */ void page_ext_put(struct page_ext *page_ext) { if (unlikely(!page_ext)) return; rcu_read_unlock(); }
253 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 // SPDX-License-Identifier: GPL-2.0 /* * This is a maximally equidistributed combined Tausworthe generator * based on code from GNU Scientific Library 1.5 (30 Jun 2004) * * lfsr113 version: * * x_n = (s1_n ^ s2_n ^ s3_n ^ s4_n) * * s1_{n+1} = (((s1_n & 4294967294) << 18) ^ (((s1_n << 6) ^ s1_n) >> 13)) * s2_{n+1} = (((s2_n & 4294967288) << 2) ^ (((s2_n << 2) ^ s2_n) >> 27)) * s3_{n+1} = (((s3_n & 4294967280) << 7) ^ (((s3_n << 13) ^ s3_n) >> 21)) * s4_{n+1} = (((s4_n & 4294967168) << 13) ^ (((s4_n << 3) ^ s4_n) >> 12)) * * The period of this generator is about 2^113 (see erratum paper). * * From: P. 
L'Ecuyer, "Maximally Equidistributed Combined Tausworthe * Generators", Mathematics of Computation, 65, 213 (1996), 203--213: * http://www.iro.umontreal.ca/~lecuyer/myftp/papers/tausme.ps * ftp://ftp.iro.umontreal.ca/pub/simulation/lecuyer/papers/tausme.ps * * There is an erratum in the paper "Tables of Maximally Equidistributed * Combined LFSR Generators", Mathematics of Computation, 68, 225 (1999), * 261--269: http://www.iro.umontreal.ca/~lecuyer/myftp/papers/tausme2.ps * * ... the k_j most significant bits of z_j must be non-zero, * for each j. (Note: this restriction also applies to the * computer code given in [4], but was mistakenly not mentioned * in that paper.) * * This affects the seeding procedure by imposing the requirement * s1 > 1, s2 > 7, s3 > 15, s4 > 127. */ #include <linux/types.h> #include <linux/percpu.h> #include <linux/export.h> #include <linux/jiffies.h> #include <linux/random.h> #include <linux/sched.h> #include <linux/bitops.h> #include <linux/slab.h> #include <asm/unaligned.h> /** * prandom_u32_state - seeded pseudo-random number generator. * @state: pointer to state structure holding seeded state. * * This is used for pseudo-randomness with no outside seeding. * For more random results, use get_random_u32(). */ u32 prandom_u32_state(struct rnd_state *state) { #define TAUSWORTHE(s, a, b, c, d) ((s & c) << d) ^ (((s << a) ^ s) >> b) state->s1 = TAUSWORTHE(state->s1, 6U, 13U, 4294967294U, 18U); state->s2 = TAUSWORTHE(state->s2, 2U, 27U, 4294967288U, 2U); state->s3 = TAUSWORTHE(state->s3, 13U, 21U, 4294967280U, 7U); state->s4 = TAUSWORTHE(state->s4, 3U, 12U, 4294967168U, 13U); return (state->s1 ^ state->s2 ^ state->s3 ^ state->s4); } EXPORT_SYMBOL(prandom_u32_state); /** * prandom_bytes_state - get the requested number of pseudo-random bytes * * @state: pointer to state structure holding seeded state. 
* @buf: where to copy the pseudo-random bytes to * @bytes: the requested number of bytes * * This is used for pseudo-randomness with no outside seeding. * For more random results, use get_random_bytes(). */ void prandom_bytes_state(struct rnd_state *state, void *buf, size_t bytes) { u8 *ptr = buf; while (bytes >= sizeof(u32)) { put_unaligned(prandom_u32_state(state), (u32 *) ptr); ptr += sizeof(u32); bytes -= sizeof(u32); } if (bytes > 0) { u32 rem = prandom_u32_state(state); do { *ptr++ = (u8) rem; bytes--; rem >>= BITS_PER_BYTE; } while (bytes > 0); } } EXPORT_SYMBOL(prandom_bytes_state); static void prandom_warmup(struct rnd_state *state) { /* Calling RNG ten times to satisfy recurrence condition */ prandom_u32_state(state); prandom_u32_state(state); prandom_u32_state(state); prandom_u32_state(state); prandom_u32_state(state); prandom_u32_state(state); prandom_u32_state(state); prandom_u32_state(state); prandom_u32_state(state); prandom_u32_state(state); } void prandom_seed_full_state(struct rnd_state __percpu *pcpu_state) { int i; for_each_possible_cpu(i) { struct rnd_state *state = per_cpu_ptr(pcpu_state, i); u32 seeds[4]; get_random_bytes(&seeds, sizeof(seeds)); state->s1 = __seed(seeds[0], 2U); state->s2 = __seed(seeds[1], 8U); state->s3 = __seed(seeds[2], 16U); state->s4 = __seed(seeds[3], 128U); prandom_warmup(state); } } EXPORT_SYMBOL(prandom_seed_full_state); #ifdef CONFIG_RANDOM32_SELFTEST static struct prandom_test1 { u32 seed; u32 result; } test1[] = { { 1U, 3484351685U }, { 2U, 2623130059U }, { 3U, 3125133893U }, { 4U, 984847254U }, }; static struct prandom_test2 { u32 seed; u32 iteration; u32 result; } test2[] = { /* Test cases against taus113 from GSL library. 
*/ { 931557656U, 959U, 2975593782U }, { 1339693295U, 876U, 3887776532U }, { 1545556285U, 961U, 1615538833U }, { 601730776U, 723U, 1776162651U }, { 1027516047U, 687U, 511983079U }, { 416526298U, 700U, 916156552U }, { 1395522032U, 652U, 2222063676U }, { 366221443U, 617U, 2992857763U }, { 1539836965U, 714U, 3783265725U }, { 556206671U, 994U, 799626459U }, { 684907218U, 799U, 367789491U }, { 2121230701U, 931U, 2115467001U }, { 1668516451U, 644U, 3620590685U }, { 768046066U, 883U, 2034077390U }, { 1989159136U, 833U, 1195767305U }, { 536585145U, 996U, 3577259204U }, { 1008129373U, 642U, 1478080776U }, { 1740775604U, 939U, 1264980372U }, { 1967883163U, 508U, 10734624U }, { 1923019697U, 730U, 3821419629U }, { 442079932U, 560U, 3440032343U }, { 1961302714U, 845U, 841962572U }, { 2030205964U, 962U, 1325144227U }, { 1160407529U, 507U, 240940858U }, { 635482502U, 779U, 4200489746U }, { 1252788931U, 699U, 867195434U }, { 1961817131U, 719U, 668237657U }, { 1071468216U, 983U, 917876630U }, { 1281848367U, 932U, 1003100039U }, { 582537119U, 780U, 1127273778U }, { 1973672777U, 853U, 1071368872U }, { 1896756996U, 762U, 1127851055U }, { 847917054U, 500U, 1717499075U }, { 1240520510U, 951U, 2849576657U }, { 1685071682U, 567U, 1961810396U }, { 1516232129U, 557U, 3173877U }, { 1208118903U, 612U, 1613145022U }, { 1817269927U, 693U, 4279122573U }, { 1510091701U, 717U, 638191229U }, { 365916850U, 807U, 600424314U }, { 399324359U, 702U, 1803598116U }, { 1318480274U, 779U, 2074237022U }, { 697758115U, 840U, 1483639402U }, { 1696507773U, 840U, 577415447U }, { 2081979121U, 981U, 3041486449U }, { 955646687U, 742U, 3846494357U }, { 1250683506U, 749U, 836419859U }, { 595003102U, 534U, 366794109U }, { 47485338U, 558U, 3521120834U }, { 619433479U, 610U, 3991783875U }, { 704096520U, 518U, 4139493852U }, { 1712224984U, 606U, 2393312003U }, { 1318233152U, 922U, 3880361134U }, { 855572992U, 761U, 1472974787U }, { 64721421U, 703U, 683860550U }, { 678931758U, 840U, 380616043U }, { 692711973U, 778U, 
1382361947U }, { 677703619U, 530U, 2826914161U }, { 92393223U, 586U, 1522128471U }, { 1222592920U, 743U, 3466726667U }, { 358288986U, 695U, 1091956998U }, { 1935056945U, 958U, 514864477U }, { 735675993U, 990U, 1294239989U }, { 1560089402U, 897U, 2238551287U }, { 70616361U, 829U, 22483098U }, { 368234700U, 731U, 2913875084U }, { 20221190U, 879U, 1564152970U }, { 539444654U, 682U, 1835141259U }, { 1314987297U, 840U, 1801114136U }, { 2019295544U, 645U, 3286438930U }, { 469023838U, 716U, 1637918202U }, { 1843754496U, 653U, 2562092152U }, { 400672036U, 809U, 4264212785U }, { 404722249U, 965U, 2704116999U }, { 600702209U, 758U, 584979986U }, { 519953954U, 667U, 2574436237U }, { 1658071126U, 694U, 2214569490U }, { 420480037U, 749U, 3430010866U }, { 690103647U, 969U, 3700758083U }, { 1029424799U, 937U, 3787746841U }, { 2012608669U, 506U, 3362628973U }, { 1535432887U, 998U, 42610943U }, { 1330635533U, 857U, 3040806504U }, { 1223800550U, 539U, 3954229517U }, { 1322411537U, 680U, 3223250324U }, { 1877847898U, 945U, 2915147143U }, { 1646356099U, 874U, 965988280U }, { 805687536U, 744U, 4032277920U }, { 1948093210U, 633U, 1346597684U }, { 392609744U, 783U, 1636083295U }, { 690241304U, 770U, 1201031298U }, { 1360302965U, 696U, 1665394461U }, { 1220090946U, 780U, 1316922812U }, { 447092251U, 500U, 3438743375U }, { 1613868791U, 592U, 828546883U }, { 523430951U, 548U, 2552392304U }, { 726692899U, 810U, 1656872867U }, { 1364340021U, 836U, 3710513486U }, { 1986257729U, 931U, 935013962U }, { 407983964U, 921U, 728767059U }, }; static void prandom_state_selftest_seed(struct rnd_state *state, u32 seed) { #define LCG(x) ((x) * 69069U) /* super-duper LCG */ state->s1 = __seed(LCG(seed), 2U); state->s2 = __seed(LCG(state->s1), 8U); state->s3 = __seed(LCG(state->s2), 16U); state->s4 = __seed(LCG(state->s3), 128U); } static int __init prandom_state_selftest(void) { int i, j, errors = 0, runs = 0; bool error = false; for (i = 0; i < ARRAY_SIZE(test1); i++) { struct rnd_state state; 
prandom_state_selftest_seed(&state, test1[i].seed); prandom_warmup(&state); if (test1[i].result != prandom_u32_state(&state)) error = true; } if (error) pr_warn("prandom: seed boundary self test failed\n"); else pr_info("prandom: seed boundary self test passed\n"); for (i = 0; i < ARRAY_SIZE(test2); i++) { struct rnd_state state; prandom_state_selftest_seed(&state, test2[i].seed); prandom_warmup(&state); for (j = 0; j < test2[i].iteration - 1; j++) prandom_u32_state(&state); if (test2[i].result != prandom_u32_state(&state)) errors++; runs++; cond_resched(); } if (errors) pr_warn("prandom: %d/%d self tests failed\n", errors, runs); else pr_info("prandom: %d self tests passed\n", runs); return 0; } core_initcall(prandom_state_selftest); #endif
26 2125 2131 82 82 11 2081 2051 2066 2092 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 // SPDX-License-Identifier: GPL-2.0-only /* * fs/anon_inodes.c * * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org> * * Thanks to Arnd Bergmann for code review and suggestions. * More changes for Thomas Gleixner suggestions. * */ #include <linux/cred.h> #include <linux/file.h> #include <linux/poll.h> #include <linux/sched.h> #include <linux/init.h> #include <linux/fs.h> #include <linux/mount.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/magic.h> #include <linux/anon_inodes.h> #include <linux/pseudo_fs.h> #include <linux/uaccess.h> static struct vfsmount *anon_inode_mnt __ro_after_init; static struct inode *anon_inode_inode __ro_after_init; /* * anon_inodefs_dname() is called from d_path(). 
*/ static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen) { return dynamic_dname(buffer, buflen, "anon_inode:%s", dentry->d_name.name); } static const struct dentry_operations anon_inodefs_dentry_operations = { .d_dname = anon_inodefs_dname, }; static int anon_inodefs_init_fs_context(struct fs_context *fc) { struct pseudo_fs_context *ctx = init_pseudo(fc, ANON_INODE_FS_MAGIC); if (!ctx) return -ENOMEM; ctx->dops = &anon_inodefs_dentry_operations; return 0; } static struct file_system_type anon_inode_fs_type = { .name = "anon_inodefs", .init_fs_context = anon_inodefs_init_fs_context, .kill_sb = kill_anon_super, }; static struct inode *anon_inode_make_secure_inode( const char *name, const struct inode *context_inode) { struct inode *inode; const struct qstr qname = QSTR_INIT(name, strlen(name)); int error; inode = alloc_anon_inode(anon_inode_mnt->mnt_sb); if (IS_ERR(inode)) return inode; inode->i_flags &= ~S_PRIVATE; error = security_inode_init_security_anon(inode, &qname, context_inode); if (error) { iput(inode); return ERR_PTR(error); } return inode; } static struct file *__anon_inode_getfile(const char *name, const struct file_operations *fops, void *priv, int flags, const struct inode *context_inode, bool make_inode) { struct inode *inode; struct file *file; if (fops->owner && !try_module_get(fops->owner)) return ERR_PTR(-ENOENT); if (make_inode) { inode = anon_inode_make_secure_inode(name, context_inode); if (IS_ERR(inode)) { file = ERR_CAST(inode); goto err; } } else { inode = anon_inode_inode; if (IS_ERR(inode)) { file = ERR_PTR(-ENODEV); goto err; } /* * We know the anon_inode inode count is always * greater than zero, so ihold() is safe. 
*/ ihold(inode); } file = alloc_file_pseudo(inode, anon_inode_mnt, name, flags & (O_ACCMODE | O_NONBLOCK), fops); if (IS_ERR(file)) goto err_iput; file->f_mapping = inode->i_mapping; file->private_data = priv; return file; err_iput: iput(inode); err: module_put(fops->owner); return file; } /** * anon_inode_getfile - creates a new file instance by hooking it up to an * anonymous inode, and a dentry that describe the "class" * of the file * * @name: [in] name of the "class" of the new file * @fops: [in] file operations for the new file * @priv: [in] private data for the new file (will be file's private_data) * @flags: [in] flags * * Creates a new file by hooking it on a single inode. This is useful for files * that do not need to have a full-fledged inode in order to operate correctly. * All the files created with anon_inode_getfile() will share a single inode, * hence saving memory and avoiding code duplication for the file/inode/dentry * setup. Returns the newly created file* or an error pointer. */ struct file *anon_inode_getfile(const char *name, const struct file_operations *fops, void *priv, int flags) { return __anon_inode_getfile(name, fops, priv, flags, NULL, false); } EXPORT_SYMBOL_GPL(anon_inode_getfile); /** * anon_inode_create_getfile - Like anon_inode_getfile(), but creates a new * !S_PRIVATE anon inode rather than reuse the * singleton anon inode and calls the * inode_init_security_anon() LSM hook. * * @name: [in] name of the "class" of the new file * @fops: [in] file operations for the new file * @priv: [in] private data for the new file (will be file's private_data) * @flags: [in] flags * @context_inode: * [in] the logical relationship with the new inode (optional) * * Create a new anonymous inode and file pair. 
This can be done for two * reasons: * * - for the inode to have its own security context, so that LSMs can enforce * policy on the inode's creation; * * - if the caller needs a unique inode, for example in order to customize * the size returned by fstat() * * The LSM may use @context_inode in inode_init_security_anon(), but a * reference to it is not held. * * Returns the newly created file* or an error pointer. */ struct file *anon_inode_create_getfile(const char *name, const struct file_operations *fops, void *priv, int flags, const struct inode *context_inode) { return __anon_inode_getfile(name, fops, priv, flags, context_inode, true); } EXPORT_SYMBOL_GPL(anon_inode_create_getfile); static int __anon_inode_getfd(const char *name, const struct file_operations *fops, void *priv, int flags, const struct inode *context_inode, bool make_inode) { int error, fd; struct file *file; error = get_unused_fd_flags(flags); if (error < 0) return error; fd = error; file = __anon_inode_getfile(name, fops, priv, flags, context_inode, make_inode); if (IS_ERR(file)) { error = PTR_ERR(file); goto err_put_unused_fd; } fd_install(fd, file); return fd; err_put_unused_fd: put_unused_fd(fd); return error; } /** * anon_inode_getfd - creates a new file instance by hooking it up to * an anonymous inode and a dentry that describe * the "class" of the file * * @name: [in] name of the "class" of the new file * @fops: [in] file operations for the new file * @priv: [in] private data for the new file (will be file's private_data) * @flags: [in] flags * * Creates a new file by hooking it on a single inode. This is * useful for files that do not need to have a full-fledged inode in * order to operate correctly. All the files created with * anon_inode_getfd() will use the same singleton inode, reducing * memory use and avoiding code duplication for the file/inode/dentry * setup. Returns a newly created file descriptor or an error code. 
*/ int anon_inode_getfd(const char *name, const struct file_operations *fops, void *priv, int flags) { return __anon_inode_getfd(name, fops, priv, flags, NULL, false); } EXPORT_SYMBOL_GPL(anon_inode_getfd); /** * anon_inode_create_getfd - Like anon_inode_getfd(), but creates a new * !S_PRIVATE anon inode rather than reuse the singleton anon inode, and calls * the inode_init_security_anon() LSM hook. * * @name: [in] name of the "class" of the new file * @fops: [in] file operations for the new file * @priv: [in] private data for the new file (will be file's private_data) * @flags: [in] flags * @context_inode: * [in] the logical relationship with the new inode (optional) * * Create a new anonymous inode and file pair. This can be done for two * reasons: * * - for the inode to have its own security context, so that LSMs can enforce * policy on the inode's creation; * * - if the caller needs a unique inode, for example in order to customize * the size returned by fstat() * * The LSM may use @context_inode in inode_init_security_anon(), but a * reference to it is not held. * * Returns a newly created file descriptor or an error code. */ int anon_inode_create_getfd(const char *name, const struct file_operations *fops, void *priv, int flags, const struct inode *context_inode) { return __anon_inode_getfd(name, fops, priv, flags, context_inode, true); } static int __init anon_inode_init(void) { anon_inode_mnt = kern_mount(&anon_inode_fs_type); if (IS_ERR(anon_inode_mnt)) panic("anon_inode_init() kernel mount failed (%ld)\n", PTR_ERR(anon_inode_mnt)); anon_inode_inode = alloc_anon_inode(anon_inode_mnt->mnt_sb); if (IS_ERR(anon_inode_inode)) panic("anon_inode_init() inode allocation failed (%ld)\n", PTR_ERR(anon_inode_inode)); return 0; } fs_initcall(anon_inode_init);
9 2 9 9 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner */ #include "hash.h" #include "main.h" #include <linux/gfp.h> #include <linux/lockdep.h> #include <linux/slab.h> /* clears the hash */ static void batadv_hash_init(struct batadv_hashtable *hash) { u32 i; for (i = 0; i < hash->size; i++) { INIT_HLIST_HEAD(&hash->table[i]); spin_lock_init(&hash->list_locks[i]); } atomic_set(&hash->generation, 0); } /** * batadv_hash_destroy() - Free only the hashtable and the hash itself * @hash: hash object to destroy */ void batadv_hash_destroy(struct batadv_hashtable *hash) { kfree(hash->list_locks); kfree(hash->table); kfree(hash); } /** * batadv_hash_new() - Allocates and clears the hashtable * @size: number of hash buckets to allocate * * Return: newly allocated hashtable, NULL on errors */ struct batadv_hashtable *batadv_hash_new(u32 size) { struct batadv_hashtable *hash; hash = kmalloc(sizeof(*hash), GFP_ATOMIC); if (!hash) return NULL; hash->table = kmalloc_array(size, sizeof(*hash->table), GFP_ATOMIC); if (!hash->table) goto free_hash; hash->list_locks = kmalloc_array(size, sizeof(*hash->list_locks), GFP_ATOMIC); if (!hash->list_locks) goto free_table; hash->size = size; batadv_hash_init(hash); return hash; free_table: kfree(hash->table); free_hash: kfree(hash); return NULL; } /** * batadv_hash_set_lock_class() - Set specific lockdep class for hash spinlocks * @hash: hash object to modify * @key: lockdep class key address */ void batadv_hash_set_lock_class(struct batadv_hashtable *hash, struct lock_class_key *key) { u32 i; for (i = 0; i < hash->size; i++) lockdep_set_class(&hash->list_locks[i], key); }
2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 
526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 
1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 
1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 
1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 
2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 
2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 
3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 
3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 
3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 
4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 
4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 
5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 
5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 
5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 
6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 
6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 
7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 
7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 
7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 
8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377 8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392 8393 8394 8395 8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407 8408 8409 8410 8411 8412 8413 8414 8415 8416 8417 8418 8419 8420 8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439 8440 8441 8442 8443 8444 8445 8446 8447 8448 8449 8450 8451 8452 8453 8454 8455 8456 8457 8458 8459 8460 8461 8462 8463 8464 8465 8466 8467 8468 8469 8470 8471 8472 8473 8474 8475 8476 8477 8478 8479 8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554 8555 8556 8557 8558 8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571 8572 8573 8574 8575 8576 8577 8578 8579 8580 8581 8582 8583 8584 8585 8586 8587 8588 8589 8590 8591 8592 8593 8594 8595 8596 8597 8598 8599 8600 8601 8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 8613 8614 8615 8616 8617 8618 8619 
8620 8621 8622 8623 8624 8625 8626 8627 8628 8629 8630 8631 8632 8633 8634 8635 8636 8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694 8695 8696 8697 8698 8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710 8711 8712 8713 8714 8715 8716 8717 8718 8719 8720 8721 8722 8723 8724 8725 8726 8727 8728 8729 8730 8731 8732 8733 8734 8735 8736 8737 8738 8739 8740 8741 8742 8743 8744 8745 8746 8747 8748 8749 8750 8751 8752 8753 8754 8755 8756 8757 8758 8759 8760 8761 8762 8763 8764 8765 8766 8767 8768 8769 8770 8771 8772 8773 8774 8775 8776 8777 8778 8779 8780 8781 8782 8783 8784 8785 8786 8787 8788 8789 8790 8791 8792 8793 8794 8795 8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807 8808 8809 8810 8811 8812 8813 8814 8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835 8836 8837 8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854 8855 8856 8857 8858 8859 8860 8861 8862 8863 8864 8865 8866 8867 8868 8869 8870 8871 8872 8873 8874 8875 8876 8877 8878 8879 8880 8881 8882 8883 8884 8885 8886 8887 8888 8889 8890 8891 8892 8893 8894 8895 8896 8897 8898 8899 8900 8901 8902 8903 8904 8905 8906 8907 8908 8909 8910 8911 8912 8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929 8930 8931 8932 8933 8934 8935 8936 8937 8938 8939 8940 8941 8942 8943 8944 8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958 8959 8960 8961 8962 8963 8964 8965 8966 8967 8968 8969 8970 8971 8972 8973 8974 8975 8976 8977 8978 8979 8980 8981 8982 8983 8984 8985 8986 8987 8988 8989 8990 8991 8992 8993 8994 8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005 9006 9007 9008 9009 9010 9011 9012 9013 9014 9015 9016 9017 9018 9019 
9020 9021 9022 9023 9024 9025 9026 9027 9028 9029 9030 9031 9032 9033 9034 9035 9036 9037 9038 9039 9040 9041 9042 9043 9044 9045 9046 9047 9048 9049 9050 9051 9052 9053 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063 9064 9065 9066 9067 9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101 9102 9103 9104 9105 9106 9107 9108 9109 9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124 9125 9126 9127 9128 9129 9130 9131 9132 9133 9134 9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153 9154 9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166 9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185 9186 9187 9188 9189 9190 9191 9192 9193 9194 9195 9196 9197 9198 9199 9200 9201 9202 9203 9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216 9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245 9246 9247 9248 9249 9250 9251 9252 9253 9254 9255 9256 9257 9258 9259 9260 9261 9262 9263 9264 9265 9266 9267 9268 9269 9270 9271 9272 9273 9274 9275 9276 9277 9278 9279 9280 9281 9282 9283 9284 9285 9286 9287 9288 9289 9290 9291 9292 9293 9294 9295 9296 9297 9298 9299 9300 9301 9302 9303 9304 9305 9306 9307 9308 9309 9310 9311 9312 9313 9314 9315 9316 9317 9318 9319 9320 9321 9322 9323 9324 9325 9326 9327 9328 9329 9330 9331 9332 9333 9334 9335 9336 9337 9338 9339 9340 9341 9342 9343 9344 9345 9346 9347 9348 9349 9350 9351 9352 9353 9354 9355 9356 9357 9358 9359 9360 9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371 9372 9373 9374 9375 9376 9377 9378 9379 9380 9381 9382 9383 9384 9385 9386 9387 9388 9389 9390 9391 9392 9393 9394 9395 9396 9397 9398 9399 9400 9401 9402 9403 9404 9405 9406 9407 9408 9409 9410 9411 9412 9413 9414 9415 9416 9417 9418 9419 
9420 9421 9422 9423 9424 9425 9426 9427 9428 9429 9430 9431 9432 9433 9434 9435 9436 9437 9438 9439 9440 9441 9442 9443 9444 9445 9446 9447 9448 9449 9450 9451 9452 9453 9454 9455 9456 9457 9458 9459 9460 9461 9462 9463 9464 9465 9466 9467 9468 9469 9470 9471 9472 9473 9474 9475 9476 9477 9478 9479 9480 9481 9482 9483 9484 9485 9486 9487 9488 9489 9490 9491 9492 9493 9494 9495 9496 9497 9498 9499 9500 9501 9502 9503 9504 9505 9506 9507 9508 9509 9510 9511 9512 9513 9514 9515 9516 9517 9518 9519 9520 9521 9522 9523 9524 9525 9526 9527 9528 9529 9530 9531 9532 9533 9534 9535 9536 9537 9538 9539 9540 9541 9542 9543 9544 9545 9546 9547 9548 9549 9550 9551 9552 9553 9554 9555 9556 9557 9558 9559 9560 9561 9562 9563 9564 9565 9566 9567 9568 9569 9570 9571 9572 9573 9574 9575 9576 9577 9578 9579 9580 9581 9582 9583 9584 9585 9586 9587 9588 9589 9590 9591 9592 9593 9594 9595 9596 9597 9598 9599 9600 9601 9602 9603 9604 9605 9606 9607 9608 9609 9610 9611 9612 9613 9614 9615 9616 9617 9618 9619 9620 9621 9622 9623 9624 9625 9626 9627 9628 9629 9630 9631 9632 9633 9634 9635 9636 9637 9638 9639 9640 9641 9642 9643 9644 9645 9646 9647 9648 9649 9650 9651 9652 9653 9654 9655 9656 9657 9658 9659 9660 9661 9662 9663 9664 9665 9666 9667 9668 9669 9670 9671 9672 9673 9674 9675 9676 9677 9678 9679 9680 9681 9682 9683 9684 9685 9686 9687 9688 9689 9690 9691 9692 9693 9694 9695 9696 9697 9698 9699 9700 9701 9702 9703 9704 9705 9706 9707 9708 9709 9710 9711 9712 9713 9714 9715 9716 9717 9718 9719 9720 9721 9722 9723 9724 9725 9726 9727 9728 9729 9730 9731 9732 9733 9734 9735 9736 9737 9738 9739 9740 9741 9742 9743 9744 9745 9746 9747 9748 9749 9750 9751 9752 9753 9754 9755 9756 9757 9758 9759 9760 9761 9762 9763 9764 9765 9766 9767 9768 9769 9770 9771 9772 9773 9774 9775 9776 9777 9778 9779 9780 9781 9782 9783 9784 9785 9786 9787 9788 9789 9790 9791 9792 9793 9794 9795 9796 9797 9798 9799 9800 9801 9802 9803 9804 9805 9806 9807 9808 9809 9810 9811 9812 9813 9814 9815 9816 9817 9818 9819 
9820 9821 9822 9823 9824 9825 9826 9827 9828 9829 9830 9831 9832 9833 9834 9835 9836 9837 9838 9839 9840 9841 9842 9843 9844 9845 9846 9847 9848 9849 9850 9851 9852 9853 9854 9855 9856 9857 9858 9859 9860 9861 9862 9863 9864 9865 9866 9867 9868 9869 9870 9871 9872 9873 9874 9875 9876 9877 9878 9879 9880 9881 9882 9883 9884 9885 9886 9887 9888 9889 9890 9891 9892 9893 9894 9895 9896 9897 9898 9899 9900 9901 9902 9903 9904 9905 9906 9907 9908 9909 9910 9911 9912 9913 9914 9915 9916 9917 9918 9919 9920 9921 9922 9923 9924 9925 9926 9927 9928 9929 9930 9931 9932 9933 9934 9935 9936 9937 9938 9939 9940 9941 9942 9943 9944 9945 9946 9947 9948 9949 9950 9951 9952 9953 9954 9955 9956 9957 9958 9959 9960 9961 9962 9963 9964 9965 9966 9967 9968 9969 9970 9971 9972 9973 9974 9975 9976 9977 9978 9979 9980 9981 9982 9983 9984 9985 9986 9987 9988 9989 9990 9991 9992 9993 9994 9995 9996 9997 9998 9999 10000 10001 10002 10003 10004 10005 10006 10007 10008 10009 10010 10011 10012 10013 10014 10015 10016 10017 10018 10019 10020 10021 10022 10023 10024 10025 10026 10027 10028 10029 10030 10031 10032 10033 10034 10035 10036 10037 10038 10039 10040 10041 10042 10043 10044 10045 10046 10047 10048 10049 10050 10051 10052 10053 10054 10055 10056 10057 10058 10059 10060 10061 10062 10063 10064 10065 10066 10067 10068 10069 10070 10071 10072 10073 10074 10075 10076 10077 10078 10079 10080 10081 10082 10083 10084 10085 10086 10087 10088 10089 10090 10091 10092 10093 10094 10095 10096 10097 10098 10099 10100 10101 10102 10103 10104 10105 10106 10107 10108 10109 10110 10111 10112 10113 10114 10115 10116 10117 10118 10119 10120 10121 10122 10123 10124 10125 10126 10127 10128 10129 10130 10131 10132 10133 10134 10135 10136 10137 10138 10139 10140 10141 10142 10143 10144 10145 10146 10147 10148 10149 10150 10151 10152 10153 10154 10155 10156 10157 10158 10159 10160 10161 10162 10163 10164 10165 10166 10167 10168 10169 10170 10171 10172 10173 10174 10175 10176 10177 10178 10179 10180 10181 10182 
10183 10184 10185 10186 10187 10188 10189 10190 10191 10192 10193 10194 10195 10196 10197 10198 10199 10200 10201 10202 10203 10204 10205 10206 10207 10208 10209 10210 10211 10212 10213 10214 10215 10216 10217 10218 10219 10220 10221 10222 10223 10224 10225 10226 10227 10228 10229 10230 10231 10232 10233 10234 10235 10236 10237 10238 10239 10240 10241 10242 10243 10244 10245 10246 10247 10248 10249 10250 10251 10252 10253 10254 10255 10256 10257 10258 10259 10260 10261 10262 10263 10264 10265 10266 10267 10268 10269 10270 10271 10272 10273 10274 10275 10276 10277 10278 10279 10280 10281 10282 10283 10284 10285 10286 10287 10288 10289 10290 10291 10292 10293 10294 10295 10296 10297 10298 10299 10300 10301 10302 10303 10304 10305 10306 10307 10308 10309 10310 10311 10312 10313 10314 10315 10316 10317 10318 10319 10320 10321 10322 10323 10324 10325 10326 10327 10328 10329 10330 10331 10332 10333 10334 10335 10336 10337 10338 10339 10340 10341 10342 10343 10344 10345 10346 10347 10348 10349 10350 10351 10352 10353 10354 10355 10356 10357 10358 10359 10360 10361 10362 10363 10364 10365 10366 10367 10368 10369 10370 10371 10372 10373 10374 10375 10376 10377 10378 10379 10380 10381 10382 10383 10384 10385 10386 10387 10388 10389 10390 10391 10392 10393 10394 10395 10396 10397 10398 10399 10400 10401 10402 10403 10404 10405 10406 10407 10408 10409 10410 10411 10412 10413 10414 10415 10416 10417 10418 10419 10420 10421 10422 10423 10424 10425 10426 10427 10428 10429 10430 10431 10432 10433 10434 10435 10436 10437 10438 10439 10440 10441 10442 10443 10444 10445 10446 10447 10448 10449 10450 10451 10452 10453 10454 10455 10456 10457 10458 10459 10460 10461 10462 10463 10464 10465 10466 10467 10468 10469 10470 10471 10472 10473 10474 10475 10476 10477 10478 10479 10480 10481 10482 10483 10484 10485 10486 10487 10488 10489 10490 10491 10492 10493 10494 10495 10496 10497 10498 10499 10500 10501 10502 10503 10504 10505 10506 10507 10508 10509 10510 10511 10512 10513 10514 10515 
10516 10517 10518 10519 10520 10521 10522 10523 10524 10525 10526 10527 10528 10529 10530 10531 10532 10533 10534 10535 10536 10537 10538 10539 10540 10541 10542 10543 10544 10545 10546 10547 10548 10549 10550 10551 10552 10553 10554 10555 10556 10557 10558 10559 10560 10561 10562 10563 10564 10565 10566 10567 10568 10569 10570 10571 10572 10573 10574 10575 10576 10577 10578 10579 10580 10581 10582 10583 10584 10585 10586 10587 10588 10589 10590 10591 10592 10593 10594 10595 10596 10597 10598 10599 10600 10601 10602 10603 10604 10605 10606 10607 10608 10609 10610 10611 10612 10613 10614 10615 10616 10617 10618 10619 10620 10621 10622 10623 10624 10625 10626 10627 10628 10629 10630 10631 10632 10633 10634 10635 10636 10637 10638 10639 10640 10641 10642 10643 10644 10645 10646 10647 10648 10649 10650 10651 10652 10653 10654 10655 10656 10657 10658 10659 10660 10661 10662 10663 10664 10665 10666 10667 10668 10669 10670 10671 10672 10673 10674 10675 10676 10677 10678 10679 10680 10681 10682 10683 10684 10685 10686 10687 10688 10689 10690 10691 10692 10693 10694 10695 10696 10697 10698 10699 10700 10701 10702 10703 10704 10705 10706 10707 10708 10709 10710 10711 10712 10713 10714 10715 10716 10717 10718 10719 10720 10721 10722 10723 10724 10725 10726 10727 10728 10729 10730 10731 10732 10733 10734 10735 10736 10737 10738 10739 10740 10741 10742 10743 10744 10745 10746 10747 10748 10749 10750 10751 10752 10753 10754 10755 10756 10757 10758 10759 10760 10761 10762 10763 10764 10765 10766 10767 10768 10769 10770 10771 10772 10773 10774 10775 10776 10777 10778 10779 10780 10781 10782 10783 10784 10785 10786 10787 10788 10789 10790 10791 10792 10793 10794 10795 10796 10797 10798 10799 10800 10801 10802 10803 10804 10805 10806 10807 10808 10809 10810 10811 10812 10813 10814 10815 10816 10817 10818 10819 10820 10821 10822 10823 10824 10825 10826 10827 10828 10829 10830 10831 10832 10833 10834 10835 10836 10837 10838 10839 10840 10841 10842 10843 10844 10845 10846 10847 10848 
10849 10850 10851 10852 10853 10854 10855 10856 10857 10858 10859 10860 10861 10862 10863 10864 10865 10866 10867 10868 10869 10870 10871 10872 10873 10874 10875 10876 10877 10878 10879 10880 10881 10882 10883 10884 10885 10886 10887 10888 10889 10890 10891 10892 10893 10894 10895 10896 10897 10898 10899 10900 10901 10902 10903 10904 10905 10906 10907 10908 10909 10910 10911 10912 10913 10914 10915 10916 10917 10918 10919 10920 10921 10922 10923 10924 10925 10926 10927 10928 10929 10930 10931 10932 10933 10934 10935 10936 10937 10938 10939 10940 10941 10942 10943 10944 10945 10946 10947 10948 10949 10950 10951 10952 10953 10954 10955 10956 10957 10958 10959 10960 10961 10962 10963 10964 10965 10966 10967 10968 10969 10970 10971 10972 10973 10974 10975 10976 10977 10978 10979 10980 10981 10982 10983 10984 10985 10986 10987 10988 10989 10990 10991 10992 10993 10994 10995 10996 10997 10998 10999 11000 11001 11002 11003 11004 11005 11006 11007 11008 11009 11010 11011 11012 11013 11014 11015 11016 11017 11018 11019 11020 11021 11022 11023 11024 11025 11026 11027 11028 11029 11030 11031 11032 11033 11034 11035 11036 11037 11038 11039 11040 11041 11042 11043 11044 11045 11046 11047 11048 11049 11050 11051 11052 11053 11054 11055 11056 11057 11058 11059 11060 11061 11062 11063 11064 11065 11066 11067 11068 11069 11070 11071 11072 11073 11074 11075 11076 11077 11078 11079 11080 11081 11082 11083 11084 11085 11086 11087 11088 11089 11090 11091 11092 11093 11094 11095 11096 11097 11098 11099 11100 11101 11102 11103 11104 11105 11106 11107 11108 11109 11110 11111 11112 11113 11114 11115 11116 11117 11118 11119 11120 11121 11122 11123 11124 11125 11126 11127 11128 11129 11130 11131 11132 11133 11134 11135 11136 11137 11138 11139 11140 11141 11142 11143 11144 11145 11146 11147 11148 11149 11150 11151 11152 11153 11154 11155 11156 11157 11158 11159 11160 11161 11162 11163 11164 11165 11166 11167 11168 11169 11170 11171 11172 11173 11174 11175 11176 11177 11178 11179 11180 11181 
11182 11183 11184 11185 11186 11187 11188 11189 11190 11191 11192 11193 11194 11195 11196 11197 11198 11199 11200 11201 11202 11203 11204 11205 11206 11207 11208 11209 11210 11211 11212 11213 11214 11215 11216 11217 11218 11219 11220 11221 11222 11223 11224 11225 11226 11227 11228 11229 11230 11231 11232 11233 11234 11235 11236 11237 11238 11239 11240 11241 11242 11243 11244 11245 11246 11247 11248 11249 11250 11251 11252 11253 11254 11255 11256 11257 11258 11259 11260 11261 11262 11263 11264 11265 11266 11267 11268 11269 11270 11271 11272 11273 11274 11275 11276 11277 11278 11279 11280 11281 11282 11283 11284 11285 11286 11287 11288 11289 11290 11291 11292 11293 11294 11295 11296 11297 11298 11299 11300 11301 11302 11303 11304 11305 11306 11307 11308 11309 11310 11311 11312 11313 11314 11315 11316 11317 11318 11319 11320 11321 11322 11323 11324 11325 11326 11327 11328 11329 11330 11331 11332 11333 11334 11335 11336 11337 11338 11339 11340 11341 11342 11343 11344 11345 11346 11347 11348 11349 11350 11351 11352 11353 11354 11355 11356 11357 11358 11359 11360 11361 11362 11363 11364 11365 11366 11367 11368 11369 11370 11371 11372 11373 11374 11375 11376 11377 11378 11379 11380 11381 11382 11383 11384 11385 11386 11387 11388 11389 11390 11391 11392 11393 11394 11395 11396 11397 11398 11399 11400 11401 11402 11403 11404 11405 11406 11407 11408 11409 11410 11411 11412 11413 11414 11415 11416 11417 11418 11419 11420 11421 11422 11423 11424 11425 11426 11427 11428 11429 11430 11431 11432 11433 11434 11435 11436 11437 11438 11439 11440 11441 11442 11443 11444 11445 11446 11447 11448 11449 11450 11451 11452 11453 11454 11455 11456 11457 11458 11459 11460 11461 11462 11463 11464 11465 11466 11467 11468 11469 11470 11471 11472 11473 11474 11475 11476 11477 11478 11479 11480 11481 11482 11483 11484 11485 11486 11487 11488 11489 11490 11491 11492 11493 11494 11495 11496 11497 11498 11499 11500 11501 11502 11503 11504 11505 11506 11507 11508 11509 11510 11511 11512 11513 11514 
11515 11516 11517 11518 11519 11520 11521 11522 11523 11524 11525 11526 11527 11528 11529 11530 11531 11532 11533 11534 11535 11536 11537 11538 11539 11540 11541 11542 11543 11544 11545 11546 11547 11548 11549 11550 11551 11552 11553 11554 11555 11556 11557 11558 11559 11560 11561 11562 11563 11564 11565 11566 11567 11568 11569 11570 11571 11572 11573 11574 11575 11576 11577 11578 11579 11580 11581 11582 11583 11584 11585 11586 11587 11588 11589 11590 11591 11592 11593 11594 11595 11596 11597 11598 11599 11600 11601 11602 11603 11604 11605 11606 11607 11608 11609 11610 11611 11612 11613 11614 11615 11616 11617 11618 11619 11620 11621 11622 11623 11624 11625 11626 11627 11628 11629 11630 11631 11632 11633 11634 11635 11636 11637 11638 11639 11640 11641 11642 11643 11644 11645 11646 11647 11648 11649 11650 11651 11652 11653 11654 11655 11656 11657 11658 11659 11660 11661 11662 11663 11664 11665 11666 11667 11668 11669 11670 11671 11672 11673 11674 11675 11676 11677 11678 11679 11680 11681 11682 11683 11684 11685 11686 11687 11688 11689 11690 11691 11692 11693 11694 11695 11696 11697 11698 11699 11700 11701 11702 11703 11704 11705 11706 11707 11708 11709 11710 11711 11712 11713 11714 11715 11716 11717 11718 11719 11720 11721 11722 11723 11724 11725 11726 11727 11728 11729 11730 11731 11732 11733 11734 11735 11736 11737 11738 11739 11740 11741 11742 11743 11744 11745 11746 11747 11748 11749 11750 11751 11752 11753 11754 11755 11756 11757 11758 11759 11760 11761 11762 11763 11764 11765 11766 11767 11768 11769 11770 11771 11772 11773 11774 11775 11776 11777 11778 11779 11780 11781 11782 11783 11784 11785 11786 11787 11788 11789 11790 11791 11792 11793 11794 11795 11796 11797 11798 11799 11800 11801 11802 11803 11804 11805 11806 11807 11808 11809 11810 11811 11812 11813 11814 11815 11816 11817 11818 11819 11820 11821 11822 11823 11824 11825 11826 11827 11828 11829 11830 11831 11832 11833 11834 11835 11836 11837 11838 11839 11840 11841 11842 11843 11844 11845 11846 11847 
11848 11849 11850 11851 11852 11853 11854 11855 11856 11857 11858 11859 11860 11861 11862 11863 11864 11865 11866 11867 11868 11869 11870 11871 11872 11873 11874 11875 11876 11877 11878 11879 11880 11881 11882 11883 11884 11885 11886 11887 11888 11889 11890 11891 11892 11893 11894 11895 11896 11897 11898 11899 11900 11901 11902 11903 11904 11905 11906 11907 11908 11909 11910 11911 11912 11913 11914 11915 11916 11917 11918 11919 11920 11921 11922 11923 11924 11925 11926 11927 11928 11929 11930 11931 11932 11933 11934 11935 11936 11937 11938 11939 11940 11941 11942 11943 11944 11945 11946 11947 11948 11949 11950 11951 11952 11953 11954 11955 11956 11957 11958 11959 11960 11961 11962 11963 11964 11965 11966 11967 11968 11969 11970 11971 11972 11973 11974 11975 11976 11977 11978 11979 11980 11981 11982 11983 11984 11985 11986 11987 11988 11989 11990 11991 11992 11993 11994 11995 11996 11997 11998 11999 12000 12001 12002 12003 12004 12005 12006 12007 12008 12009 12010 12011 12012 12013 12014 12015 12016 12017 12018 12019 12020 12021 12022 12023 12024 12025 12026 12027 12028 12029 12030 12031 12032 12033 12034 12035 12036 12037 12038 12039 12040 12041 12042 12043 12044 12045 12046 12047 12048 12049 12050 12051 12052 12053 12054 12055 12056 12057 12058 12059 12060 12061 12062 12063 12064 12065 12066 12067 12068 12069 12070 12071 12072 12073 12074 12075 12076 12077 12078 12079 12080 12081 12082 12083 12084 12085 12086 12087 12088 12089 12090 12091 12092 12093 12094 12095 12096 12097 12098 12099 12100 12101 12102 12103 12104 12105 12106 12107 12108 12109 12110 12111 12112 12113 12114 12115 12116 12117 12118 12119 12120 12121 12122 12123 12124 12125 12126 12127 12128 12129 12130 12131 12132 12133 12134 12135 12136 12137 12138 12139 12140 12141 12142 12143 12144 12145 12146 12147 12148 12149 12150 12151 12152 12153 12154 12155 12156 12157 12158 12159 12160 12161 12162 12163 12164 12165 12166 12167 12168 12169 12170 12171 12172 12173 12174 12175 12176 12177 12178 12179 12180 
12181 12182 12183 12184 12185 12186 12187 12188 12189 12190 12191 12192 12193 12194 12195 12196 12197 12198 12199 12200 12201 12202 12203 12204 12205 12206 12207 12208 12209 12210 12211 12212 12213 12214 12215 12216 12217 12218 12219 12220 12221 12222 12223 12224 12225 12226 12227 12228 12229 12230 12231 12232 12233 12234 12235 12236 12237 12238 12239 12240 12241 12242 12243 12244 12245 12246 12247 12248 12249 12250 12251 12252 12253 12254 12255 12256 12257 12258 12259 12260 12261 12262 12263 12264 12265 12266 12267 12268 12269 12270 12271 12272 12273 12274 12275 12276 12277 12278 12279 12280 12281 12282 12283 12284 12285 12286 12287 12288 12289 12290 12291 12292 12293 12294 12295 12296 12297 12298 12299 12300 12301 12302 12303 12304 12305 12306 12307 12308 12309 12310 12311 12312 12313 12314 12315 12316 12317 12318 12319 12320 12321 12322 12323 12324 12325 12326 12327 12328 12329 12330 12331 12332 12333 12334 12335 12336 12337 12338 12339 12340 12341 12342 12343 12344 12345 12346 12347 12348 12349 12350 12351 12352 12353 12354 12355 12356 12357 12358 12359 12360 12361 12362 12363 12364 12365 12366 12367 12368 12369 12370 12371 12372 12373 12374 12375 12376 12377 12378 12379 12380 12381 12382 12383 12384 12385 12386 12387 12388 12389 12390 12391 12392 12393 12394 12395 12396 12397 12398 12399 12400 12401 12402 12403 12404 12405 12406 12407 12408 12409 12410 12411 12412 12413 12414 12415 12416 12417 12418 12419 12420 12421 12422 12423 12424 12425 12426 12427 12428 12429 12430 12431 12432 12433 12434 12435 12436 12437 12438 12439 12440 12441 12442 12443 12444 12445 12446 12447 12448 12449 12450 12451 12452 12453 12454 12455 12456 12457 12458 12459 12460 12461 12462 12463 12464 12465 12466 12467 12468 12469 12470 12471 12472 12473 12474 12475 12476 12477 12478 12479 12480 12481 12482 12483 12484 12485 12486 12487 12488 12489 12490 12491 12492 12493 12494 12495 12496 12497 12498 12499 12500 12501 12502 12503 12504 12505 12506 12507 12508 12509 12510 12511 12512 12513 
12514 12515 12516 12517 12518 12519 12520 12521 12522 12523 12524 12525 12526 12527 12528 12529 12530 12531 12532 12533 12534 12535 12536 12537 12538 12539 12540 12541 12542 12543 12544 12545 12546 12547 12548 12549 12550 12551 12552 12553 12554 12555 12556 12557 12558 12559 12560 12561 12562 12563 12564 12565 12566 12567 12568 12569 12570 12571 12572 12573 12574 12575 12576 12577 12578 12579 12580 12581 12582 12583 12584 12585 12586 12587 12588 12589 12590 12591 12592 12593 12594 12595 12596 12597 12598 12599 12600 12601 12602 12603 12604 12605 12606 12607 12608 12609 12610 12611 12612 12613 12614 12615 12616 12617 12618 12619 12620 12621 12622 12623 12624 12625 12626 12627 12628 12629 12630 12631 12632 12633 12634 12635 12636 12637 12638 12639 12640 12641 12642 12643 12644 12645 12646 12647 12648 12649 12650 12651 12652 12653 12654 12655 12656 12657 12658 12659 12660 12661 12662 12663 12664 12665 12666 12667 12668 12669 12670 12671 12672 12673 12674 12675 12676 12677 12678 12679 12680 12681 12682 12683 12684 12685 12686 12687 12688 12689 12690 12691 12692 12693 12694 12695 12696 12697 12698 12699 12700 12701 12702 12703 12704 12705 12706 12707 12708 12709 12710 12711 12712 12713 12714 12715 12716 12717 12718 12719 12720 12721 12722 12723 12724 12725 12726 12727 12728 12729 12730 12731 12732 12733 12734 12735 12736 12737 12738 12739 12740 12741 12742 12743 12744 12745 12746 12747 12748 12749 12750 12751 12752 12753 12754 12755 12756 12757 12758 12759 12760 12761 12762 12763 12764 12765 12766 12767 12768 12769 12770 12771 12772 12773 12774 12775 12776 12777 12778 12779 12780 12781 12782 12783 12784 12785 12786 12787 12788 12789 12790 12791 12792 12793 12794 12795 12796 12797 12798 12799 12800 12801 12802 12803 12804 12805 12806 12807 12808 12809 12810 12811 12812 12813 12814 12815 12816 12817 12818 12819 12820 12821 12822 12823 12824 12825 12826 12827 12828 12829 12830 12831 12832 12833 12834 12835 12836 12837 12838 12839 12840 12841 12842 12843 12844 12845 12846 
12847 12848 12849 12850 12851 12852 12853 12854 12855 12856 12857 12858 12859 12860 12861 12862 12863 12864 12865 12866 12867 12868 12869 12870 12871 12872 12873 12874 12875 12876 12877 12878 12879 12880 12881 12882 12883 12884 12885 12886 12887 12888 12889 12890 12891 12892 12893 12894 12895 12896 12897 12898 12899 12900 12901 12902 12903 12904 12905 12906 12907 12908 12909 12910 12911 12912 12913 12914 12915 12916 12917 12918 12919 12920 12921 12922 12923 12924 12925 12926 12927 12928 12929 12930 12931 12932 12933 12934 12935 12936 12937 12938 12939 12940 12941 12942 12943 12944 12945 12946 12947 12948 12949 12950 12951 12952 12953 12954 12955 12956 12957 12958 12959 12960 12961 12962 12963 12964 12965 12966 12967 12968 12969 12970 12971 12972 12973 12974 12975 12976 12977 12978 12979 12980 12981 12982 12983 12984 12985 12986 12987 12988 12989 12990 12991 12992 12993 12994 12995 12996 12997 12998 12999 13000 13001 13002 13003 13004 13005 13006 13007 13008 13009 13010 13011 13012 13013 13014 13015 13016 13017 13018 13019 13020 13021 13022 13023 13024 13025 13026 13027 13028 13029 13030 13031 13032 13033 13034 13035 13036 13037 13038 13039 13040 13041 13042 13043 13044 13045 13046 13047 13048 13049 13050 13051 13052 13053 13054 13055 13056 13057 13058 13059 13060 13061 13062 13063 13064 13065 13066 13067 13068 13069 13070 13071 13072 13073 13074 13075 13076 13077 13078 13079 13080 13081 13082 13083 13084 13085 13086 13087 13088 13089 13090 13091 13092 13093 13094 13095 13096 13097 13098 13099 13100 13101 13102 13103 13104 13105 13106 13107 13108 13109 13110 13111 13112 13113 13114 13115 13116 13117 13118 13119 13120 13121 13122 13123 13124 13125 13126 13127 13128 13129 13130 13131 13132 13133 13134 13135 13136 13137 13138 13139 13140 13141 13142 13143 13144 13145 13146 13147 13148 13149 13150 13151 13152 13153 13154 13155 13156 13157 13158 13159 13160 13161 13162 13163 13164 13165 13166 13167 13168 13169 13170 13171 13172 13173 13174 13175 13176 13177 13178 13179 
13180 13181 13182 13183 13184 13185 13186 13187 13188 13189 13190 13191 13192 13193 13194 13195 13196 13197 13198 13199 13200 13201 13202 13203 13204 13205 13206 13207 13208 13209 13210 13211 13212 13213 13214 13215 13216 13217 13218 13219 13220 13221 13222 13223 13224 13225 13226 13227 13228 13229 13230 13231 13232 13233 13234 13235 13236 13237 13238 13239 13240 13241 13242 13243 13244 13245 13246 13247 13248 13249 13250 13251 13252 13253 13254 13255 13256 13257 13258 13259 13260 13261 13262 13263 13264 13265 13266 13267 13268 13269 13270 13271 13272 13273 13274 13275 13276 13277 13278 13279 13280 13281 13282 13283 13284 13285 13286 13287 13288 13289 13290 13291 13292 13293 13294 13295 13296 13297 13298 13299 13300 13301 13302 13303 13304 13305 13306 13307 13308 13309 13310 13311 13312 13313 13314 13315 13316 13317 13318 13319 13320 13321 13322 13323 13324 13325 13326 13327 13328 13329 13330 13331 13332 13333 13334 13335 13336 13337 13338 13339 13340 13341 13342 13343 13344 13345 13346 13347 13348 13349 13350 13351 13352 13353 13354 13355 13356 13357 13358 13359 13360 13361 13362 13363 13364 13365 13366 13367 13368 13369 13370 13371 13372 13373 13374 13375 13376 13377 13378 13379 13380 13381 13382 13383 13384 13385 13386 13387 13388 13389 13390 13391 13392 13393 13394 13395 13396 13397 13398 13399 13400 13401 13402 13403 13404 13405 13406 13407 13408 13409 13410 13411 13412 13413 13414 13415 13416 13417 13418 13419 13420 13421 13422 13423 13424 13425 13426 13427 13428 13429 13430 13431 13432 13433 13434 13435 13436 13437 13438 13439 13440 13441 13442 13443 13444 13445 13446 13447 13448 13449 13450 13451 13452 13453 13454 13455 13456 13457 13458 13459 13460 13461 13462 13463 13464 13465 13466 13467 13468 13469 13470 13471 13472 13473 13474 13475 13476 13477 13478 13479 13480 13481 13482 13483 13484 13485 13486 13487 13488 13489 13490 13491 13492 13493 13494 13495 13496 13497 13498 13499 13500 13501 13502 13503 13504 13505 13506 13507 13508 13509 13510 13511 13512 
13513 13514 13515 13516 13517 13518 13519 13520 13521 13522 13523 13524 13525 13526 13527 13528 13529 13530 13531 13532 13533 13534 13535 13536 13537 13538 13539 13540 13541 13542 13543 13544 13545 13546 13547 13548 13549 13550 13551 13552 13553 13554 13555 13556 13557 13558 13559 13560 13561 13562 13563 13564 13565 13566 13567 13568 13569 13570 13571 13572 13573 13574 13575 13576 13577 13578 13579 13580 13581 13582 13583 13584 13585 13586 13587 13588 13589 13590 13591 13592 13593 13594 13595 13596 13597 13598 13599 13600 13601 13602 13603 13604 13605 13606 13607 13608 13609 13610 13611 13612 13613 13614 13615 13616 13617 13618 13619 13620 13621 13622 13623 13624 13625 13626 13627 13628 13629 13630 13631 13632 13633 13634 13635 13636 13637 13638 13639 13640 13641 13642 13643 13644 13645 13646 13647 13648 13649 13650 13651 13652 13653 13654 13655 13656 13657 13658 13659 13660 13661 13662 13663 13664 13665 13666 13667 13668 13669 13670 13671 13672 13673 13674 13675 13676 13677 13678 13679 13680 13681 13682 13683 13684 13685 13686 13687 13688 13689 13690 13691 13692 13693 13694 13695 13696 13697 13698 13699 13700 13701 13702 13703 13704 13705 13706 13707 13708 13709 13710 13711 13712 13713 13714 13715 13716 13717 13718 13719 13720 13721 13722 13723 13724 13725 13726 13727 13728 13729 13730 13731 13732 13733 13734 13735 13736 13737 13738 13739 13740 13741 13742 13743 13744 13745 13746 13747 13748 13749 13750 13751 13752 13753 13754 13755 13756 13757 13758 13759 13760 13761 13762 13763 13764 13765 13766 13767 13768 13769 13770 13771 13772 13773 13774 13775 13776 13777 13778 13779 13780 13781 13782 13783 13784 13785 13786 13787 13788 13789 13790 13791 13792 13793 13794 13795 13796 13797 13798 13799 13800 13801 13802 13803 13804 13805 13806 13807 13808 13809 13810 13811 13812 13813 13814 13815 13816 13817 13818 13819 13820 13821 13822 13823 13824 13825 13826 13827 13828 13829 13830 13831 13832 13833 13834 13835 13836 13837 13838 13839 13840 13841 13842 13843 13844 13845 
13846 13847 13848 13849 13850 13851 13852 13853 13854 13855 13856 13857 13858 13859 13860 13861 13862 13863 13864 13865 13866 13867 13868 13869 13870 13871 13872 13873 13874 13875 13876 13877 13878 13879 13880 13881 13882 13883 13884 13885 13886 13887 13888 13889 13890 13891 13892 13893 13894 13895 13896 13897 13898 13899 13900 13901 13902 13903 13904 13905 13906 13907 13908 13909 13910 13911 13912 13913 13914 13915 13916 13917 13918 13919 13920 13921 13922 13923 13924 13925 13926 13927 13928 13929 13930 13931 13932 // SPDX-License-Identifier: GPL-2.0-only /* * Kernel-based Virtual Machine driver for Linux * * derived from drivers/kvm/kvm_main.c * * Copyright (C) 2006 Qumranet, Inc. * Copyright (C) 2008 Qumranet, Inc. * Copyright IBM Corporation, 2008 * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Authors: * Avi Kivity <avi@qumranet.com> * Yaniv Kamay <yaniv@qumranet.com> * Amit Shah <amit.shah@qumranet.com> * Ben-Ami Yassour <benami@il.ibm.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include "irq.h" #include "ioapic.h" #include "mmu.h" #include "i8254.h" #include "tss.h" #include "kvm_cache_regs.h" #include "kvm_emulate.h" #include "mmu/page_track.h" #include "x86.h" #include "cpuid.h" #include "pmu.h" #include "hyperv.h" #include "lapic.h" #include "xen.h" #include "smm.h" #include <linux/clocksource.h> #include <linux/interrupt.h> #include <linux/kvm.h> #include <linux/fs.h> #include <linux/vmalloc.h> #include <linux/export.h> #include <linux/moduleparam.h> #include <linux/mman.h> #include <linux/highmem.h> #include <linux/iommu.h> #include <linux/cpufreq.h> #include <linux/user-return-notifier.h> #include <linux/srcu.h> #include <linux/slab.h> #include <linux/perf_event.h> #include <linux/uaccess.h> #include <linux/hash.h> #include <linux/pci.h> #include <linux/timekeeper_internal.h> #include <linux/pvclock_gtod.h> #include <linux/kvm_irqfd.h> #include <linux/irqbypass.h> #include <linux/sched/stat.h> #include 
<linux/sched/isolation.h>
#include <linux/mem_encrypt.h>
#include <linux/entry-kvm.h>
#include <linux/suspend.h>
#include <linux/smp.h>

#include <trace/events/ipi.h>
#include <trace/events/kvm.h>

#include <asm/debugreg.h>
#include <asm/msr.h>
#include <asm/desc.h>
#include <asm/mce.h>
#include <asm/pkru.h>
#include <linux/kernel_stat.h>
#include <asm/fpu/api.h>
#include <asm/fpu/xcr.h>
#include <asm/fpu/xstate.h>
#include <asm/pvclock.h>
#include <asm/div64.h>
#include <asm/irq_remapping.h>
#include <asm/mshyperv.h>
#include <asm/hypervisor.h>
#include <asm/tlbflush.h>
#include <asm/intel_pt.h>
#include <asm/emulate_prefix.h>
#include <asm/sgx.h>
#include <clocksource/hyperv_timer.h>

#define CREATE_TRACE_POINTS
#include "trace.h"

#define MAX_IO_MSRS 256
#define KVM_MAX_MCE_BANKS 32

struct kvm_caps kvm_caps __read_mostly = {
	.supported_mce_cap = MCG_CTL_P | MCG_SER_P,
};
EXPORT_SYMBOL_GPL(kvm_caps);

/* Wrap an errno in an ERR_PTR() and cast it to a __user pointer. */
#define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e))

/* Fetch the vcpu pointer stored in an emulator context. */
#define emul_to_vcpu(ctxt) \
	((struct kvm_vcpu *)(ctxt)->vcpu)

/*
 * EFER defaults:
 * - enable syscall by default because it's emulated by KVM
 * - enable LME and LMA by default on 64 bit KVM
 */
#ifdef CONFIG_X86_64
static
u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
#else
static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
#endif

static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;

#define KVM_EXIT_HYPERCALL_VALID_MASK (1 << KVM_HC_MAP_GPA_RANGE)

#define KVM_CAP_PMU_VALID_MASK KVM_PMU_CAP_DISABLE

#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
				    KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)

/* Forward declarations for static helpers defined later in this file. */
static void update_cr8_intercept(struct kvm_vcpu *vcpu);
static void process_nmi(struct kvm_vcpu *vcpu);
static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
static void store_regs(struct kvm_vcpu *vcpu);
static int sync_regs(struct kvm_vcpu *vcpu);
static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu);

static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);

static DEFINE_MUTEX(vendor_module_lock);
struct kvm_x86_ops kvm_x86_ops __read_mostly;

/*
 * Including <asm/kvm-x86-ops.h> with KVM_X86_OP defined as below emits one
 * DEFINE_STATIC_CALL_NULL(kvm_x86_<func>, ...) per listed op, typed after the
 * matching member of struct kvm_x86_ops.  The OPTIONAL and OPTIONAL_RET0
 * variants are aliased to the same definition here.
 */
#define KVM_X86_OP(func)					     \
	DEFINE_STATIC_CALL_NULL(kvm_x86_##func,			     \
				*(((struct kvm_x86_ops *)0)->func));
#define KVM_X86_OP_OPTIONAL KVM_X86_OP
#define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP
#include <asm/kvm-x86-ops.h>
EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits);
EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg);

/*
 * ignore_msrs and report_ignored_msrs are consumed by
 * kvm_msr_ignored_check(): with ignore_msrs set the failed MSR access is
 * masked, and report_ignored_msrs controls whether that is logged.
 */
static bool __read_mostly ignore_msrs = 0;
module_param(ignore_msrs, bool, 0644);

bool __read_mostly report_ignored_msrs = true;
module_param(report_ignored_msrs, bool, 0644);
EXPORT_SYMBOL_GPL(report_ignored_msrs);

unsigned int min_timer_period_us = 200;
module_param(min_timer_period_us, uint, 0644);

static bool __read_mostly kvmclock_periodic_sync = true;
module_param(kvmclock_periodic_sync, bool, 0444);

/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
static u32 __read_mostly tsc_tolerance_ppm = 250;
module_param(tsc_tolerance_ppm, uint, 0644);

/*
 * lapic timer advance (tscdeadline mode only) in nanoseconds.  '-1' enables
 * adaptive tuning starting from default advancement of 1000ns.  '0' disables
 * advancement entirely.  Any other value is used as-is and disables adaptive
 * tuning, i.e. allows privileged userspace to set an exact advancement time.
 */
static int __read_mostly lapic_timer_advance_ns = -1;
module_param(lapic_timer_advance_ns, int, 0644);

static bool __read_mostly vector_hashing = true;
module_param(vector_hashing, bool, 0444);

bool __read_mostly enable_vmware_backdoor = false;
module_param(enable_vmware_backdoor, bool, 0444);
EXPORT_SYMBOL_GPL(enable_vmware_backdoor);

/*
 * Flags to manipulate forced emulation behavior (any non-zero value will
 * enable forced emulation).
 */
#define KVM_FEP_CLEAR_RFLAGS_RF	BIT(1)
static int __read_mostly force_emulation_prefix;
module_param(force_emulation_prefix, int, 0644);

int __read_mostly pi_inject_timer = -1;
module_param(pi_inject_timer, bint, 0644);

/* Enable/disable PMU virtualization */
bool __read_mostly enable_pmu = true;
EXPORT_SYMBOL_GPL(enable_pmu);
module_param(enable_pmu, bool, 0444);

bool __read_mostly eager_page_split = true;
module_param(eager_page_split, bool, 0644);

/* Enable/disable SMT_RSB bug mitigation */
static bool __read_mostly mitigate_smt_rsb;
module_param(mitigate_smt_rsb, bool, 0444);

/*
 * Restoring the host value for MSRs that are only consumed when running in
 * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
 * returns to userspace, i.e. the kernel can run with the guest's value.
 */
#define KVM_MAX_NR_USER_RETURN_MSRS 16

/*
 * Per-CPU state for the deferred MSR restore (allocated through the __percpu
 * pointer user_return_msrs below):
 *   urn        - notifier whose callback is set to kvm_on_user_return()
 *   registered - true while urn is registered with the user-return notifier
 *   values[]   - one entry per slot in kvm_uret_msrs_list[]; 'host' is the
 *                value to restore, 'curr' is the value last written
 */
struct kvm_user_return_msrs {
	struct user_return_notifier urn;
	bool registered;
	struct kvm_user_return_msr_values {
		u64 host;
		u64 curr;
	} values[KVM_MAX_NR_USER_RETURN_MSRS];
};

/* Number of populated slots in kvm_uret_msrs_list[]. */
u32 __read_mostly kvm_nr_uret_msrs;
EXPORT_SYMBOL_GPL(kvm_nr_uret_msrs);
static u32 __read_mostly kvm_uret_msrs_list[KVM_MAX_NR_USER_RETURN_MSRS];
static struct kvm_user_return_msrs __percpu *user_return_msrs;

#define KVM_SUPPORTED_XCR0     (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
				| XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
				| XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
				| XFEATURE_MASK_PKRU | XFEATURE_MASK_XTILE)

u64 __read_mostly host_efer;
EXPORT_SYMBOL_GPL(host_efer);

bool __read_mostly allow_smaller_maxphyaddr = 0;
EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);

bool __read_mostly enable_apicv = true;
EXPORT_SYMBOL_GPL(enable_apicv);

u64 __read_mostly host_xss;
EXPORT_SYMBOL_GPL(host_xss);

u64 __read_mostly host_arch_capabilities;
EXPORT_SYMBOL_GPL(host_arch_capabilities);

/* Descriptor table for the per-VM statistics. */
const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
	KVM_GENERIC_VM_STATS(),
	STATS_DESC_COUNTER(VM, mmu_shadow_zapped),
	STATS_DESC_COUNTER(VM, mmu_pte_write),
	STATS_DESC_COUNTER(VM, mmu_pde_zapped),
	STATS_DESC_COUNTER(VM, mmu_flooded),
	STATS_DESC_COUNTER(VM, mmu_recycled),
	STATS_DESC_COUNTER(VM, mmu_cache_miss),
	STATS_DESC_ICOUNTER(VM, mmu_unsync),
	STATS_DESC_ICOUNTER(VM, pages_4k),
	STATS_DESC_ICOUNTER(VM, pages_2m),
	STATS_DESC_ICOUNTER(VM, pages_1g),
	STATS_DESC_ICOUNTER(VM, nx_lpage_splits),
	STATS_DESC_PCOUNTER(VM, max_mmu_rmap_size),
	STATS_DESC_PCOUNTER(VM, max_mmu_page_hash_collisions)
};

/*
 * Header for the per-VM statistics.  The offsets are cumulative: the id
 * region starts right after the header, the descriptors start
 * KVM_STATS_NAME_SIZE bytes later, and the data starts after the descriptor
 * table.
 */
const struct kvm_stats_header kvm_vm_stats_header = {
	.name_size = KVM_STATS_NAME_SIZE,
	.num_desc = ARRAY_SIZE(kvm_vm_stats_desc),
	.id_offset = sizeof(struct kvm_stats_header),
	.desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
	.data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
		       sizeof(kvm_vm_stats_desc),
};

/* Descriptor table for the per-vCPU statistics. */
const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
	KVM_GENERIC_VCPU_STATS(),
	STATS_DESC_COUNTER(VCPU, pf_taken),
	STATS_DESC_COUNTER(VCPU, pf_fixed),
	STATS_DESC_COUNTER(VCPU, pf_emulate),
	STATS_DESC_COUNTER(VCPU, pf_spurious),
	STATS_DESC_COUNTER(VCPU, pf_fast),
	STATS_DESC_COUNTER(VCPU, pf_mmio_spte_created),
	STATS_DESC_COUNTER(VCPU, pf_guest),
	STATS_DESC_COUNTER(VCPU, tlb_flush),
	STATS_DESC_COUNTER(VCPU, invlpg),
	STATS_DESC_COUNTER(VCPU, exits),
	STATS_DESC_COUNTER(VCPU, io_exits),
	STATS_DESC_COUNTER(VCPU, mmio_exits),
	STATS_DESC_COUNTER(VCPU, signal_exits),
	STATS_DESC_COUNTER(VCPU, irq_window_exits),
	STATS_DESC_COUNTER(VCPU, nmi_window_exits),
	STATS_DESC_COUNTER(VCPU, l1d_flush),
	STATS_DESC_COUNTER(VCPU, halt_exits),
	STATS_DESC_COUNTER(VCPU, request_irq_exits),
	STATS_DESC_COUNTER(VCPU, irq_exits),
	STATS_DESC_COUNTER(VCPU, host_state_reload),
	STATS_DESC_COUNTER(VCPU, fpu_reload),
	STATS_DESC_COUNTER(VCPU, insn_emulation),
	STATS_DESC_COUNTER(VCPU, insn_emulation_fail),
	STATS_DESC_COUNTER(VCPU, hypercalls),
	STATS_DESC_COUNTER(VCPU, irq_injections),
	STATS_DESC_COUNTER(VCPU, nmi_injections),
	STATS_DESC_COUNTER(VCPU, req_event),
	STATS_DESC_COUNTER(VCPU, nested_run),
	STATS_DESC_COUNTER(VCPU, directed_yield_attempted),
STATS_DESC_COUNTER(VCPU, directed_yield_successful), STATS_DESC_COUNTER(VCPU, preemption_reported), STATS_DESC_COUNTER(VCPU, preemption_other), STATS_DESC_IBOOLEAN(VCPU, guest_mode), STATS_DESC_COUNTER(VCPU, notify_window_exits), }; const struct kvm_stats_header kvm_vcpu_stats_header = { .name_size = KVM_STATS_NAME_SIZE, .num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc), .id_offset = sizeof(struct kvm_stats_header), .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE + sizeof(kvm_vcpu_stats_desc), }; u64 __read_mostly host_xcr0; static struct kmem_cache *x86_emulator_cache; /* * When called, it means the previous get/set msr reached an invalid msr. * Return true if we want to ignore/silent this failed msr access. */ static bool kvm_msr_ignored_check(u32 msr, u64 data, bool write) { const char *op = write ? "wrmsr" : "rdmsr"; if (ignore_msrs) { if (report_ignored_msrs) kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n", op, msr, data); /* Mask the error */ return true; } else { kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n", op, msr, data); return false; } } static struct kmem_cache *kvm_alloc_emulator_cache(void) { unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src); unsigned int size = sizeof(struct x86_emulate_ctxt); return kmem_cache_create_usercopy("x86_emulator", size, __alignof__(struct x86_emulate_ctxt), SLAB_ACCOUNT, useroffset, size - useroffset, NULL); } static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) { int i; for (i = 0; i < ASYNC_PF_PER_VCPU; i++) vcpu->arch.apf.gfns[i] = ~0; } static void kvm_on_user_return(struct user_return_notifier *urn) { unsigned slot; struct kvm_user_return_msrs *msrs = container_of(urn, struct kvm_user_return_msrs, urn); struct kvm_user_return_msr_values *values; unsigned long flags; /* * Disabling irqs at this point since the following code could 
be * interrupted and executed through kvm_arch_hardware_disable() */ local_irq_save(flags); if (msrs->registered) { msrs->registered = false; user_return_notifier_unregister(urn); } local_irq_restore(flags); for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) { values = &msrs->values[slot]; if (values->host != values->curr) { wrmsrl(kvm_uret_msrs_list[slot], values->host); values->curr = values->host; } } } static int kvm_probe_user_return_msr(u32 msr) { u64 val; int ret; preempt_disable(); ret = rdmsrl_safe(msr, &val); if (ret) goto out; ret = wrmsrl_safe(msr, val); out: preempt_enable(); return ret; } int kvm_add_user_return_msr(u32 msr) { BUG_ON(kvm_nr_uret_msrs >= KVM_MAX_NR_USER_RETURN_MSRS); if (kvm_probe_user_return_msr(msr)) return -1; kvm_uret_msrs_list[kvm_nr_uret_msrs] = msr; return kvm_nr_uret_msrs++; } EXPORT_SYMBOL_GPL(kvm_add_user_return_msr); int kvm_find_user_return_msr(u32 msr) { int i; for (i = 0; i < kvm_nr_uret_msrs; ++i) { if (kvm_uret_msrs_list[i] == msr) return i; } return -1; } EXPORT_SYMBOL_GPL(kvm_find_user_return_msr); static void kvm_user_return_msr_cpu_online(void) { unsigned int cpu = smp_processor_id(); struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); u64 value; int i; for (i = 0; i < kvm_nr_uret_msrs; ++i) { rdmsrl_safe(kvm_uret_msrs_list[i], &value); msrs->values[i].host = value; msrs->values[i].curr = value; } } int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask) { unsigned int cpu = smp_processor_id(); struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); int err; value = (value & mask) | (msrs->values[slot].host & ~mask); if (value == msrs->values[slot].curr) return 0; err = wrmsrl_safe(kvm_uret_msrs_list[slot], value); if (err) return 1; msrs->values[slot].curr = value; if (!msrs->registered) { msrs->urn.on_user_return = kvm_on_user_return; user_return_notifier_register(&msrs->urn); msrs->registered = true; } return 0; } EXPORT_SYMBOL_GPL(kvm_set_user_return_msr); static void 
drop_user_return_notifiers(void)
{
	unsigned int cpu = smp_processor_id();
	struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);

	/* Run the user-return callback now if this CPU has it registered. */
	if (msrs->registered)
		kvm_on_user_return(&msrs->urn);
}

/* Return the vCPU's cached APIC base value. */
u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.apic_base;
}

/* Return the local APIC mode derived from the vCPU's APIC base value. */
enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu)
{
	return kvm_apic_mode(kvm_get_apic_base(vcpu));
}
EXPORT_SYMBOL_GPL(kvm_get_apic_mode);

/*
 * Set the APIC base from @msr_info->data.
 *
 * Returns 1 if the value sets reserved bits (X2APIC_ENABLE is reserved when
 * the guest CPUID lacks X2APIC) or encodes an invalid mode.  For writes that
 * are not host-initiated, the transitions x2APIC -> xAPIC and
 * disabled -> x2APIC are also rejected with 1.  Returns 0 on success, after
 * updating the LAPIC base and recalculating the APIC map.
 */
int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	enum lapic_mode old_mode = kvm_get_apic_mode(vcpu);
	enum lapic_mode new_mode = kvm_apic_mode(msr_info->data);
	u64 reserved_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu) | 0x2ff |
		(guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE);

	if ((msr_info->data & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID)
		return 1;
	if (!msr_info->host_initiated) {
		if (old_mode == LAPIC_MODE_X2APIC && new_mode == LAPIC_MODE_XAPIC)
			return 1;
		if (old_mode == LAPIC_MODE_DISABLED && new_mode == LAPIC_MODE_X2APIC)
			return 1;
	}

	kvm_lapic_set_base(vcpu, msr_info->data);
	kvm_recalculate_apic_map(vcpu->kvm);
	return 0;
}

/*
 * Handle a fault on a hardware virtualization (VMX or SVM) instruction.
 *
 * Hardware virtualization extension instructions may fault if a reboot turns
 * off virtualization while processes are running.  Usually after catching the
 * fault we just panic; during reboot instead the instruction is ignored.
 */
noinstr void kvm_spurious_fault(void)
{
	/* Fault while not rebooting.  We want the trace.
*/ BUG_ON(!kvm_rebooting); } EXPORT_SYMBOL_GPL(kvm_spurious_fault); #define EXCPT_BENIGN 0 #define EXCPT_CONTRIBUTORY 1 #define EXCPT_PF 2 static int exception_class(int vector) { switch (vector) { case PF_VECTOR: return EXCPT_PF; case DE_VECTOR: case TS_VECTOR: case NP_VECTOR: case SS_VECTOR: case GP_VECTOR: return EXCPT_CONTRIBUTORY; default: break; } return EXCPT_BENIGN; } #define EXCPT_FAULT 0 #define EXCPT_TRAP 1 #define EXCPT_ABORT 2 #define EXCPT_INTERRUPT 3 #define EXCPT_DB 4 static int exception_type(int vector) { unsigned int mask; if (WARN_ON(vector > 31 || vector == NMI_VECTOR)) return EXCPT_INTERRUPT; mask = 1 << vector; /* * #DBs can be trap-like or fault-like, the caller must check other CPU * state, e.g. DR6, to determine whether a #DB is a trap or fault. */ if (mask & (1 << DB_VECTOR)) return EXCPT_DB; if (mask & ((1 << BP_VECTOR) | (1 << OF_VECTOR))) return EXCPT_TRAP; if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR))) return EXCPT_ABORT; /* Reserved exceptions will result in fault */ return EXCPT_FAULT; } void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu, struct kvm_queued_exception *ex) { if (!ex->has_payload) return; switch (ex->vector) { case DB_VECTOR: /* * "Certain debug exceptions may clear bit 0-3. The * remaining contents of the DR6 register are never * cleared by the processor". */ vcpu->arch.dr6 &= ~DR_TRAP_BITS; /* * In order to reflect the #DB exception payload in guest * dr6, three components need to be considered: active low * bit, FIXED_1 bits and active high bits (e.g. DR6_BD, * DR6_BS and DR6_BT) * DR6_ACTIVE_LOW contains the FIXED_1 and active low bits. * In the target guest dr6: * FIXED_1 bits should always be set. * Active low bits should be cleared if 1-setting in payload. * Active high bits should be set if 1-setting in payload. * * Note, the payload is compatible with the pending debug * exceptions/exit qualification under VMX, that active_low bits * are active high in payload. * So they need to be flipped for DR6. 
*/
		vcpu->arch.dr6 |= DR6_ACTIVE_LOW;
		vcpu->arch.dr6 |= ex->payload;
		vcpu->arch.dr6 ^= ex->payload & DR6_ACTIVE_LOW;

		/*
		 * The #DB payload is defined as compatible with the 'pending
		 * debug exceptions' field under VMX, not DR6. While bit 12 is
		 * defined in the 'pending debug exceptions' field (enabled
		 * breakpoint), it is reserved and must be zero in DR6.
		 */
		vcpu->arch.dr6 &= ~BIT(12);
		break;
	case PF_VECTOR:
		vcpu->arch.cr2 = ex->payload;
		break;
	}

	ex->has_payload = false;
	ex->payload = 0;
}
EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload);

/*
 * Record an exception in vcpu->arch.exception_vmexit as pending (and not
 * injected), together with its error code and payload.
 */
static void kvm_queue_exception_vmexit(struct kvm_vcpu *vcpu, unsigned int vector,
				       bool has_error_code, u32 error_code,
				       bool has_payload, unsigned long payload)
{
	struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit;

	ex->vector = vector;
	ex->injected = false;
	ex->pending = true;
	ex->has_error_code = has_error_code;
	ex->error_code = error_code;
	ex->has_payload = has_payload;
	ex->payload = payload;
}

/* Forcibly leave the nested mode in cases like a vCPU reset */
static void kvm_leave_nested(struct kvm_vcpu *vcpu)
{
	kvm_x86_ops.nested_ops->leave_nested(vcpu);
}

/*
 * Common worker for queueing or re-injecting exception @nr.
 *
 * If no exception is pending or injected the new one is simply recorded.
 * Otherwise the new exception is combined with the previous one: a previous
 * #DF requests a triple fault, certain class combinations are replaced by a
 * synthesized #DF, and in all other cases the new exception replaces the
 * old one.
 */
static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
		unsigned nr, bool has_error, u32 error_code,
	        bool has_payload, unsigned long payload, bool reinject)
{
	u32 prev_nr;
	int class1, class2;

	kvm_make_request(KVM_REQ_EVENT, vcpu);

	/*
	 * If the exception is destined for L2 and isn't being reinjected,
	 * morph it to a VM-Exit if L1 wants to intercept the exception.  A
	 * previously injected exception is not checked because it was checked
	 * when it was original queued, and re-checking is incorrect if _L1_
	 * injected the exception, in which case it's exempt from interception.
*/
	if (!reinject && is_guest_mode(vcpu) &&
	    kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, nr, error_code)) {
		kvm_queue_exception_vmexit(vcpu, nr, has_error, error_code,
					   has_payload, payload);
		return;
	}

	if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
	queue:
		if (reinject) {
			/*
			 * On VM-Entry, an exception can be pending if and only
			 * if event injection was blocked by nested_run_pending.
			 * In that case, however, vcpu_enter_guest() requests an
			 * immediate exit, and the guest shouldn't proceed far
			 * enough to need reinjection.
			 */
			WARN_ON_ONCE(kvm_is_exception_pending(vcpu));
			vcpu->arch.exception.injected = true;
			if (WARN_ON_ONCE(has_payload)) {
				/*
				 * A reinjected event has already
				 * delivered its payload.
				 */
				has_payload = false;
				payload = 0;
			}
		} else {
			vcpu->arch.exception.pending = true;
			vcpu->arch.exception.injected = false;
		}
		vcpu->arch.exception.has_error_code = has_error;
		vcpu->arch.exception.vector = nr;
		vcpu->arch.exception.error_code = error_code;
		vcpu->arch.exception.has_payload = has_payload;
		vcpu->arch.exception.payload = payload;
		/* Outside guest mode the payload is folded in right away. */
		if (!is_guest_mode(vcpu))
			kvm_deliver_exception_payload(vcpu,
						      &vcpu->arch.exception);
		return;
	}

	/* to check exception */
	prev_nr = vcpu->arch.exception.vector;
	if (prev_nr == DF_VECTOR) {
		/* triple fault -> shutdown */
		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
		return;
	}
	class1 = exception_class(prev_nr);
	class2 = exception_class(nr);
	if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) ||
	    (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
		/*
		 * Synthesize #DF.  Clear the previously injected or pending
		 * exception so as not to incorrectly trigger shutdown.
*/
		vcpu->arch.exception.injected = false;
		vcpu->arch.exception.pending = false;

		kvm_queue_exception_e(vcpu, DF_VECTOR, 0);
	} else {
		/* replace previous exception with a new one in a hope
		   that instruction re-execution will regenerate lost
		   exception */
		goto queue;
	}
}

/* Queue exception @nr with no error code and no payload. */
void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
{
	kvm_multiple_exception(vcpu, nr, false, 0, false, 0, false);
}
EXPORT_SYMBOL_GPL(kvm_queue_exception);

/* Re-inject exception @nr with no error code and no payload. */
void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
{
	kvm_multiple_exception(vcpu, nr, false, 0, false, 0, true);
}
EXPORT_SYMBOL_GPL(kvm_requeue_exception);

/* Queue exception @nr with @payload and no error code. */
void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr,
			   unsigned long payload)
{
	kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false);
}
EXPORT_SYMBOL_GPL(kvm_queue_exception_p);

/* Queue exception @nr with both @error_code and @payload. */
static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr,
				    u32 error_code, unsigned long payload)
{
	kvm_multiple_exception(vcpu, nr, true, error_code,
			       true, payload, false);
}

/*
 * Finish an intercepted instruction: inject #GP(0) and return 1 if @err is
 * non-zero, otherwise skip the instruction and return the skip result.
 */
int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
{
	if (err)
		kvm_inject_gp(vcpu, 0);
	else
		return kvm_skip_emulated_instruction(vcpu);

	return 1;
}
EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);

/*
 * Like kvm_complete_insn_gp(), but on success the instruction is skipped
 * through the emulator with EMULTYPE_COMPLETE_USER_EXIT set.
 */
static int complete_emulated_insn_gp(struct kvm_vcpu *vcpu, int err)
{
	if (err) {
		kvm_inject_gp(vcpu, 0);
		return 1;
	}

	return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE | EMULTYPE_SKIP |
				       EMULTYPE_COMPLETE_USER_EXIT);
}

/*
 * Queue the page fault described by @fault and bump the pf_guest stat.
 */
void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
{
	++vcpu->stat.pf_guest;

	/*
	 * Async #PF in L2 is always forwarded to L1 as a VM-Exit regardless of
	 * whether or not L1 wants to intercept "regular" #PF.
*/
	if (is_guest_mode(vcpu) && fault->async_page_fault)
		kvm_queue_exception_vmexit(vcpu, PF_VECTOR,
					   true, fault->error_code,
					   true, fault->address);
	else
		kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code,
					fault->address);
}

/*
 * Inject a page fault produced by emulation through the inject_page_fault
 * hook of the MMU that faulted: vcpu->arch.mmu for a nested page fault,
 * vcpu->arch.walk_mmu otherwise.
 */
void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
				    struct x86_exception *fault)
{
	struct kvm_mmu *fault_mmu;
	WARN_ON_ONCE(fault->vector != PF_VECTOR);

	fault_mmu = fault->nested_page_fault ? vcpu->arch.mmu :
					       vcpu->arch.walk_mmu;

	/*
	 * Invalidate the TLB entry for the faulting address, if it exists,
	 * else the access will fault indefinitely (and to emulate hardware).
	 */
	if ((fault->error_code & PFERR_PRESENT_MASK) &&
	    !(fault->error_code & PFERR_RSVD_MASK))
		kvm_mmu_invalidate_addr(vcpu, fault_mmu, fault->address,
					KVM_MMU_ROOT_CURRENT);

	fault_mmu->inject_page_fault(vcpu, fault);
}
EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault);

/* Count one more queued NMI and request NMI processing for the vCPU. */
void kvm_inject_nmi(struct kvm_vcpu *vcpu)
{
	atomic_inc(&vcpu->arch.nmi_queued);
	kvm_make_request(KVM_REQ_NMI, vcpu);
}

/* Queue exception @nr with @error_code and no payload. */
void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
	kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, false);
}
EXPORT_SYMBOL_GPL(kvm_queue_exception_e);

/* Re-inject exception @nr with @error_code and no payload. */
void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
	kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, true);
}
EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);

/*
 * Checks if cpl <= required_cpl; if true, return true.  Otherwise queue
 * a #GP and return false.
*/
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
{
	if (static_call(kvm_x86_get_cpl)(vcpu) <= required_cpl)
		return true;
	kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
	return false;
}

/*
 * Return true if debug register @dr may be accessed.  Accesses to DR4/DR5
 * while CR4.DE is set queue a #UD and return false.
 */
bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
{
	if ((dr != 4 && dr != 5) || !kvm_is_cr4_bit_set(vcpu, X86_CR4_DE))
		return true;

	kvm_queue_exception(vcpu, UD_VECTOR);
	return false;
}
EXPORT_SYMBOL_GPL(kvm_require_dr);

/*
 * Reserved bits of a PAE PDPTE: the vCPU's reserved GPA bits plus bits 8:5
 * and 2:1.
 */
static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2);
}

/*
 * Load the pae pdptrs.  Return 1 if they are all valid, 0 otherwise.
 */
int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
	gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
	gpa_t real_gpa;
	int i;
	int ret;
	u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];

	/*
	 * If the MMU is nested, CR3 holds an L2 GPA and needs to be translated
	 * to an L1 GPA.
	 */
	real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(pdpt_gfn),
				     PFERR_USER_MASK | PFERR_WRITE_MASK, NULL);
	if (real_gpa == INVALID_GPA)
		return 0;

	/* Note the offset, PDPTRs are 32 byte aligned when using PAE paging. */
	ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(real_gpa), pdpte,
				       cr3 & GENMASK(11, 5), sizeof(pdpte));
	if (ret < 0)
		return 0;

	/* A present PDPTE with any reserved bit set invalidates the load. */
	for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
		if ((pdpte[i] & PT_PRESENT_MASK) &&
		    (pdpte[i] & pdptr_rsvd_bits(vcpu))) {
			return 0;
		}
	}

	/*
	 * Marking VCPU_EXREG_PDPTR dirty doesn't work for !tdp_enabled.
	 * Shadow page roots need to be reconstructed instead.
*/
	if (!tdp_enabled && memcmp(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)))
		kvm_mmu_free_roots(vcpu->kvm, mmu, KVM_MMU_ROOT_CURRENT);

	memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
	kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
	kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
	vcpu->arch.pdptrs_from_userspace = false;

	return 1;
}
EXPORT_SYMBOL_GPL(load_pdptrs);

/*
 * Return false if @cr0 has any of the upper 32 bits set (64-bit builds), has
 * NW set without CD, or has PG set without PE; otherwise defer to the
 * vendor-specific is_valid_cr0 hook.
 */
static bool kvm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
#ifdef CONFIG_X86_64
	if (cr0 & 0xffffffff00000000UL)
		return false;
#endif

	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
		return false;

	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
		return false;

	return static_call(kvm_x86_is_valid_cr0)(vcpu, cr0);
}

/*
 * Propagate the side effects of a CR0 change from @old_cr0 to @cr0 to the
 * MMU, the async #PF state and the TLB.
 */
void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0)
{
	/*
	 * CR0.WP is incorporated into the MMU role, but only for non-nested,
	 * indirect shadow MMUs.  If paging is disabled, no updates are needed
	 * as there are no permission bits to emulate.  If TDP is enabled, the
	 * MMU's metadata needs to be updated, e.g. so that emulating guest
	 * translations does the right thing, but there's no need to unload the
	 * root as CR0.WP doesn't affect SPTEs.
	 */
	if ((cr0 ^ old_cr0) == X86_CR0_WP) {
		if (!(cr0 & X86_CR0_PG))
			return;

		if (tdp_enabled) {
			kvm_init_mmu(vcpu);
			return;
		}
	}

	if ((cr0 ^ old_cr0) & X86_CR0_PG) {
		kvm_clear_async_pf_completion_queue(vcpu);
		kvm_async_pf_hash_reset(vcpu);

		/*
		 * Clearing CR0.PG is defined to flush the TLB from the guest's
		 * perspective.
*/
		if (!(cr0 & X86_CR0_PG))
			kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
	}

	if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS)
		kvm_mmu_reset_context(vcpu);

	if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
	    kvm_mmu_honors_guest_mtrrs(vcpu->kvm) &&
	    !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
		kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
}
EXPORT_SYMBOL_GPL(kvm_post_set_cr0);

/*
 * Validate and install a new guest CR0.  Returns 0 on success and 1 if the
 * value or the requested mode transition is rejected.
 */
int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
	unsigned long old_cr0 = kvm_read_cr0(vcpu);

	if (!kvm_is_valid_cr0(vcpu, cr0))
		return 1;

	cr0 |= X86_CR0_ET;

	/* Write to CR0 reserved bits are ignored, even on Intel. */
	cr0 &= ~CR0_RESERVED_BITS;

#ifdef CONFIG_X86_64
	/* Enabling paging with EFER.LME set requires CR4.PAE=1 and CS.L=0. */
	if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) &&
	    (cr0 & X86_CR0_PG)) {
		int cs_db, cs_l;

		if (!is_pae(vcpu))
			return 1;
		static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
		if (cs_l)
			return 1;
	}
#endif
	if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) &&
	    is_pae(vcpu) && ((cr0 ^ old_cr0) & X86_CR0_PDPTR_BITS) &&
	    !load_pdptrs(vcpu, kvm_read_cr3(vcpu)))
		return 1;

	/* Paging cannot be disabled in 64-bit mode or while CR4.PCIDE=1. */
	if (!(cr0 & X86_CR0_PG) &&
	    (is_64_bit_mode(vcpu) || kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE)))
		return 1;

	static_call(kvm_x86_set_cr0)(vcpu, cr0);

	kvm_post_set_cr0(vcpu, old_cr0, cr0);

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr0);

/*
 * Emulate LMSW: only the low four bits of @msw are used, and the current PE
 * bit is kept (ORed with the new one).  The kvm_set_cr0() result is ignored.
 */
void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
	(void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
}
EXPORT_SYMBOL_GPL(kvm_lmsw);

/*
 * Load the guest's XCR0, IA32_XSS and PKRU into hardware where they differ
 * from the host values.  No-op when guest state is protected.
 */
void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
{
	if (vcpu->arch.guest_state_protected)
		return;

	if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {

		if (vcpu->arch.xcr0 != host_xcr0)
			xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);

		if (guest_can_use(vcpu, X86_FEATURE_XSAVES) &&
		    vcpu->arch.ia32_xss != host_xss)
			wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
	}

	if (cpu_feature_enabled(X86_FEATURE_PKU) &&
	    vcpu->arch.pkru != vcpu->arch.host_pkru &&
	    ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
	     kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE)))
		write_pkru(vcpu->arch.pkru);
}
EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);

/*
 * Counterpart of kvm_load_guest_xsave_state(): save the guest's PKRU and
 * restore the host's PKRU, XCR0 and IA32_XSS where they differ from the
 * guest values.  No-op when guest state is protected.
 */
void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
{
	if (vcpu->arch.guest_state_protected)
		return;

	if (cpu_feature_enabled(X86_FEATURE_PKU) &&
	    ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
	     kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) {
		vcpu->arch.pkru = rdpkru();
		if (vcpu->arch.pkru != vcpu->arch.host_pkru)
			write_pkru(vcpu->arch.host_pkru);
	}

	if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {

		if (vcpu->arch.xcr0 != host_xcr0)
			xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);

		if (guest_can_use(vcpu, X86_FEATURE_XSAVES) &&
		    vcpu->arch.ia32_xss != host_xss)
			wrmsrl(MSR_IA32_XSS, host_xss);
	}

}
EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state);

#ifdef CONFIG_X86_64
/* Dynamic user xfeatures that are within the guest's supported XCR0. */
static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.guest_supported_xcr0 & XFEATURE_MASK_USER_DYNAMIC;
}
#endif

/*
 * Validate and install a new XCR value.  Returns 0 on success and 1 if the
 * index or value is rejected by any of the checks below.
 */
static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
{
	u64 xcr0 = xcr;
	u64 old_xcr0 = vcpu->arch.xcr0;
	u64 valid_bits;

	/* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now  */
	if (index != XCR_XFEATURE_ENABLED_MASK)
		return 1;
	if (!(xcr0 & XFEATURE_MASK_FP))
		return 1;
	if ((xcr0 & XFEATURE_MASK_YMM) && !(xcr0 & XFEATURE_MASK_SSE))
		return 1;

	/*
	 * Do not allow the guest to set bits that we do not support
	 * saving.  However, xcr0 bit 0 is always set, even if the
	 * emulated CPU does not support XSAVE (see kvm_vcpu_reset()).
*/
	valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP;
	if (xcr0 & ~valid_bits)
		return 1;

	/* BNDREGS and BNDCSR must be enabled or disabled together. */
	if ((!(xcr0 & XFEATURE_MASK_BNDREGS)) !=
	    (!(xcr0 & XFEATURE_MASK_BNDCSR)))
		return 1;

	/* AVX512 requires YMM and all AVX512 bits to be set together. */
	if (xcr0 & XFEATURE_MASK_AVX512) {
		if (!(xcr0 & XFEATURE_MASK_YMM))
			return 1;
		if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512)
			return 1;
	}

	/* The XTILE bits are likewise all-or-nothing. */
	if ((xcr0 & XFEATURE_MASK_XTILE) &&
	    ((xcr0 & XFEATURE_MASK_XTILE) != XFEATURE_MASK_XTILE))
		return 1;

	vcpu->arch.xcr0 = xcr0;

	if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
		kvm_update_cpuid_runtime(vcpu);
	return 0;
}

/*
 * Emulate XSETBV using RCX as the index and EDX:EAX as the value.  Injects
 * #GP(0) and returns 1 if CPL != 0 or the value is rejected; otherwise
 * skips the instruction.
 */
int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu)
{
	/* Note, #UD due to CR4.OSXSAVE=0 has priority over the intercept. */
	if (static_call(kvm_x86_get_cpl)(vcpu) != 0 ||
	    __kvm_set_xcr(vcpu, kvm_rcx_read(vcpu), kvm_read_edx_eax(vcpu))) {
		kvm_inject_gp(vcpu, 0);
		return 1;
	}

	return kvm_skip_emulated_instruction(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_emulate_xsetbv);

/*
 * Return false if @cr4 sets any bit in the global cr4_reserved_bits mask or
 * in the vCPU's guest-reserved CR4 mask.
 */
bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	if (cr4 & cr4_reserved_bits)
		return false;

	if (cr4 & vcpu->arch.cr4_guest_rsvd_bits)
		return false;

	return true;
}
EXPORT_SYMBOL_GPL(__kvm_is_valid_cr4);

/* Generic CR4 check followed by the vendor-specific is_valid_cr4 hook. */
static bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	return __kvm_is_valid_cr4(vcpu, cr4) &&
	       static_call(kvm_x86_is_valid_cr4)(vcpu, cr4);
}

/*
 * Propagate the side effects of a CR4 change from @old_cr4 to @cr4 to the
 * MMU and request the required TLB flushes.
 */
void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4)
{
	if ((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS)
		kvm_mmu_reset_context(vcpu);

	/*
	 * If CR4.PCIDE is changed 0 -> 1, there is no need to flush the TLB
	 * according to the SDM; however, stale prev_roots could be reused
	 * incorrectly in the future after a MOV to CR3 with NOFLUSH=1, so we
	 * free them all.  This is *not* a superset of KVM_REQ_TLB_FLUSH_GUEST
	 * or KVM_REQ_TLB_FLUSH_CURRENT, because the hardware TLB is not flushed,
	 * so fall through.
*/
	if (!tdp_enabled &&
	    (cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE))
		kvm_mmu_unload(vcpu);

	/*
	 * The TLB has to be flushed for all PCIDs if any of the following
	 * (architecturally required) changes happen:
	 * - CR4.PCIDE is changed from 1 to 0
	 * - CR4.PGE is toggled
	 *
	 * This is a superset of KVM_REQ_TLB_FLUSH_CURRENT.
	 */
	if (((cr4 ^ old_cr4) & X86_CR4_PGE) ||
	    (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);

	/*
	 * The TLB has to be flushed for the current PCID if any of the
	 * following (architecturally required) changes happen:
	 * - CR4.SMEP is changed from 0 to 1
	 * - CR4.PAE is toggled
	 */
	else if (((cr4 ^ old_cr4) & X86_CR4_PAE) ||
		 ((cr4 & X86_CR4_SMEP) && !(old_cr4 & X86_CR4_SMEP)))
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);

}
EXPORT_SYMBOL_GPL(kvm_post_set_cr4);

/*
 * Validate and install a new guest CR4.  Returns 0 on success and 1 if the
 * value or the requested transition is rejected.
 */
int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	unsigned long old_cr4 = kvm_read_cr4(vcpu);

	if (!kvm_is_valid_cr4(vcpu, cr4))
		return 1;

	if (is_long_mode(vcpu)) {
		/* In long mode PAE must stay set and LA57 cannot be toggled. */
		if (!(cr4 & X86_CR4_PAE))
			return 1;
		if ((cr4 ^ old_cr4) & X86_CR4_LA57)
			return 1;
	} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
		   && ((cr4 ^ old_cr4) & X86_CR4_PDPTR_BITS)
		   && !load_pdptrs(vcpu, kvm_read_cr3(vcpu)))
		return 1;

	if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
		/* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
		if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
			return 1;
	}

	static_call(kvm_x86_set_cr4)(vcpu, cr4);

	kvm_post_set_cr4(vcpu, old_cr4, cr4);

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr4);

/*
 * Invalidate cached translations and previous roots associated with @pcid.
 */
static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid)
{
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	unsigned long roots_to_free = 0;
	int i;

	/*
	 * MOV CR3 and INVPCID are usually not intercepted when using TDP, but
	 * this is reachable when running EPT=1 and unrestricted_guest=0,  and
	 * also via the emulator.
KVM's TDP page tables are not in the scope of
	 * the invalidation, but the guest's TLB entries need to be flushed as
	 * the CPU may have cached entries in its TLB for the target PCID.
	 */
	if (unlikely(tdp_enabled)) {
		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
		return;
	}

	/*
	 * If neither the current CR3 nor any of the prev_roots use the given
	 * PCID, then nothing needs to be done here because a resync will
	 * happen anyway before switching to any other CR3.
	 */
	if (kvm_get_active_pcid(vcpu) == pcid) {
		kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
	}

	/*
	 * If PCID is disabled, there is no need to free prev_roots even if the
	 * PCIDs for them are also 0, because MOV to CR3 always flushes the TLB
	 * with PCIDE=0.
	 */
	if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE))
		return;

	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
		if (kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd) == pcid)
			roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);

	kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free);
}

/*
 * Validate and install a new guest CR3.  Returns 0 on success and 1 if the
 * value is illegal or the PAE PDPTRs cannot be loaded.
 */
int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	bool skip_tlb_flush = false;
	unsigned long pcid = 0;
#ifdef CONFIG_X86_64
	/* With PCIDE set, strip the NOFLUSH bit and extract the PCID. */
	if (kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE)) {
		skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
		cr3 &= ~X86_CR3_PCID_NOFLUSH;
		pcid = cr3 & X86_CR3_PCID_MASK;
	}
#endif

	/* PDPTRs are always reloaded for PAE paging. */
	if (cr3 == kvm_read_cr3(vcpu) && !is_pae_paging(vcpu))
		goto handle_tlb_flush;

	/*
	 * Do not condition the GPA check on long mode, this helper is used to
	 * stuff CR3, e.g. for RSM emulation, and there is no guarantee that
	 * the current vCPU mode is accurate.
	 */
	if (!kvm_vcpu_is_legal_cr3(vcpu, cr3))
		return 1;

	if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, cr3))
		return 1;

	if (cr3 != kvm_read_cr3(vcpu))
		kvm_mmu_new_pgd(vcpu, cr3);

	vcpu->arch.cr3 = cr3;
	kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
	/* Do not call post_set_cr3, we do not get here for confidential guests.
*/

handle_tlb_flush:
	/*
	 * A load of CR3 that flushes the TLB flushes only the current PCID,
	 * even if PCID is disabled, in which case PCID=0 is flushed.  It's a
	 * moot point in the end because _disabling_ PCID will flush all PCIDs,
	 * and it's impossible to use a non-zero PCID when PCID is disabled,
	 * i.e. only PCID=0 can be relevant.
	 */
	if (!skip_tlb_flush)
		kvm_invalidate_pcid(vcpu, pcid);

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr3);

/*
 * Set CR8.  Returns 1 if reserved bits are set, 0 otherwise.  The value is
 * stored as the TPR of the in-kernel LAPIC if there is one, else in
 * vcpu->arch.cr8.
 */
int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
{
	if (cr8 & CR8_RESERVED_BITS)
		return 1;
	if (lapic_in_kernel(vcpu))
		kvm_lapic_set_tpr(vcpu, cr8);
	else
		vcpu->arch.cr8 = cr8;
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr8);

/* Read CR8 from the in-kernel LAPIC if present, else from vcpu->arch.cr8. */
unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
{
	if (lapic_in_kernel(vcpu))
		return kvm_lapic_get_cr8(vcpu);
	else
		return vcpu->arch.cr8;
}
EXPORT_SYMBOL_GPL(kvm_get_cr8);

/*
 * Copy the guest's DR0-DR3 into the effective debug registers, unless
 * userspace is using hardware breakpoints (KVM_GUESTDBG_USE_HW_BP).
 */
static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
{
	int i;

	if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
		for (i = 0; i < KVM_NR_DB_REGS; i++)
			vcpu->arch.eff_db[i] = vcpu->arch.db[i];
	}
}

/*
 * Push the effective DR7 (the debugger's value when KVM_GUESTDBG_USE_HW_BP
 * is set, the guest's otherwise) to the vendor code and update the
 * KVM_DEBUGREG_BP_ENABLED flag to match its breakpoint-enable bits.
 */
void kvm_update_dr7(struct kvm_vcpu *vcpu)
{
	unsigned long dr7;

	if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
		dr7 = vcpu->arch.guest_debug_dr7;
	else
		dr7 = vcpu->arch.dr7;
	static_call(kvm_x86_set_dr7)(vcpu, dr7);
	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
	if (dr7 & DR7_BP_EN_MASK)
		vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
}
EXPORT_SYMBOL_GPL(kvm_update_dr7);

/*
 * DR6 bits forced to 1 for this vCPU: DR6_FIXED_1, plus DR6_RTM and
 * DR6_BUS_LOCK when the guest CPUID lacks the corresponding feature.
 */
static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
{
	u64 fixed = DR6_FIXED_1;

	if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM))
		fixed |= DR6_RTM;

	if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
		fixed |= DR6_BUS_LOCK;
	return fixed;
}

/*
 * Write @val to debug register @dr (4 and 5 alias 6 and 7).  Returns 0 on
 * success, 1 if the value is invalid for DR6/DR7 (caller injects #GP).
 */
int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
{
	size_t size = ARRAY_SIZE(vcpu->arch.db);

	switch (dr) {
	case 0 ...
3:
		vcpu->arch.db[array_index_nospec(dr, size)] = val;
		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
			vcpu->arch.eff_db[dr] = val;
		break;
	case 4:
	case 6:
		if (!kvm_dr6_valid(val))
			return 1; /* #GP */
		vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
		break;
	case 5:
	default: /* 7 */
		if (!kvm_dr7_valid(val))
			return 1; /* #GP */
		vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
		kvm_update_dr7(vcpu);
		break;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_dr);

/* Read debug register @dr; 4 and 5 alias 6 and 7 respectively. */
unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr)
{
	size_t size = ARRAY_SIZE(vcpu->arch.db);

	switch (dr) {
	case 0 ... 3:
		return vcpu->arch.db[array_index_nospec(dr, size)];
	case 4:
	case 6:
		return vcpu->arch.dr6;
	case 5:
	default: /* 7 */
		return vcpu->arch.dr7;
	}
}
EXPORT_SYMBOL_GPL(kvm_get_dr);

/*
 * Emulate RDPMC: read the counter selected by ECX and return it in
 * EDX:EAX.  Injects #GP(0) and returns 1 if the PMU read fails.
 */
int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu)
{
	u32 ecx = kvm_rcx_read(vcpu);
	u64 data;

	if (kvm_pmu_rdpmc(vcpu, ecx, &data)) {
		kvm_inject_gp(vcpu, 0);
		return 1;
	}

	kvm_rax_write(vcpu, (u32)data);
	kvm_rdx_write(vcpu, data >> 32);
	return kvm_skip_emulated_instruction(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc);

/*
 * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) track
 * the set of MSRs that KVM exposes to userspace through KVM_GET_MSRS,
 * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.  msrs_to_save holds MSRs that
 * require host support, i.e. should be probed via RDMSR.  emulated_msrs holds
 * MSRs that KVM emulates without strictly requiring host support.
 * msr_based_features holds MSRs that enumerate features, i.e. are effectively
 * CPUID leafs.  Note, msr_based_features isn't mutually exclusive with
 * msrs_to_save and emulated_msrs.
*/

/* Non-PMU MSRs that are candidates for msrs_to_save[]. */
static const u32 msrs_to_save_base[] = {
	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
	MSR_STAR,
#ifdef CONFIG_X86_64
	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
	MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
	MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
	MSR_IA32_SPEC_CTRL, MSR_IA32_TSX_CTRL,
	MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
	MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
	MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
	MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
	MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
	MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
	MSR_IA32_UMWAIT_CONTROL,

	MSR_IA32_XFD, MSR_IA32_XFD_ERR,
};

/* PMU MSRs that are candidates for msrs_to_save[]. */
static const u32 msrs_to_save_pmu[] = {
	MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
	MSR_ARCH_PERFMON_FIXED_CTR0 + 2,
	MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
	MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
	MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG,

	/* This part of MSRs should match KVM_INTEL_PMC_MAX_GENERIC. */
	MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
	MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3,
	MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5,
	MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7,
	MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
	MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
	MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
	MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7,

	MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3,
	MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3,

	/* This part of MSRs should match KVM_AMD_PMC_MAX_GENERIC.
*/
	MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2,
	MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5,
	MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2,
	MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5,

	MSR_AMD64_PERF_CNTR_GLOBAL_CTL,
	MSR_AMD64_PERF_CNTR_GLOBAL_STATUS,
	MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR,
};

/* Sized to hold every entry of both candidate tables. */
static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) +
			ARRAY_SIZE(msrs_to_save_pmu)];
static unsigned num_msrs_to_save;

/* Candidate list for emulated_msrs[]. */
static const u32 emulated_msrs_all[] = {
	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,

#ifdef CONFIG_KVM_HYPERV
	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
	HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
	HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY,
	HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
	HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
	HV_X64_MSR_RESET,
	HV_X64_MSR_VP_INDEX,
	HV_X64_MSR_VP_RUNTIME,
	HV_X64_MSR_SCONTROL,
	HV_X64_MSR_STIMER0_CONFIG,
	HV_X64_MSR_VP_ASSIST_PAGE,
	HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
	HV_X64_MSR_TSC_EMULATION_STATUS, HV_X64_MSR_TSC_INVARIANT_CONTROL,
	HV_X64_MSR_SYNDBG_OPTIONS,
	HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS,
	HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER,
	HV_X64_MSR_SYNDBG_PENDING_BUFFER,
#endif

	MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
	MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK,

	MSR_IA32_TSC_ADJUST,
	MSR_IA32_TSC_DEADLINE,
	MSR_IA32_ARCH_CAPABILITIES,
	MSR_IA32_PERF_CAPABILITIES,
	MSR_IA32_MISC_ENABLE,
	MSR_IA32_MCG_STATUS,
	MSR_IA32_MCG_CTL,
	MSR_IA32_MCG_EXT_CTL,
	MSR_IA32_SMBASE,
	MSR_SMI_COUNT,
	MSR_PLATFORM_INFO,
	MSR_MISC_FEATURES_ENABLES,
	MSR_AMD64_VIRT_SPEC_CTRL,
	MSR_AMD64_TSC_RATIO,
	MSR_IA32_POWER_CTL,
	MSR_IA32_UCODE_REV,

	/*
	 * KVM always supports the "true" VMX control MSRs, even if the host
	 * does not.
The VMX MSRs as a whole are considered "emulated" as KVM
	 * doesn't strictly require them to exist in the host (ignoring that
	 * KVM would refuse to load in the first place if the core set of MSRs
	 * aren't supported).
	 */
	MSR_IA32_VMX_BASIC,
	MSR_IA32_VMX_TRUE_PINBASED_CTLS,
	MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
	MSR_IA32_VMX_TRUE_EXIT_CTLS,
	MSR_IA32_VMX_TRUE_ENTRY_CTLS,
	MSR_IA32_VMX_MISC,
	MSR_IA32_VMX_CR0_FIXED0,
	MSR_IA32_VMX_CR4_FIXED0,
	MSR_IA32_VMX_VMCS_ENUM,
	MSR_IA32_VMX_PROCBASED_CTLS2,
	MSR_IA32_VMX_EPT_VPID_CAP,
	MSR_IA32_VMX_VMFUNC,

	MSR_K7_HWCR,
	MSR_KVM_POLL_CONTROL,
};

/* Sized to hold every entry of emulated_msrs_all[]. */
static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)];
static unsigned num_emulated_msrs;

/*
 * List of MSRs that control the existence of MSR-based features, i.e. MSRs
 * that are effectively CPUID leafs.  VMX MSRs are also included in the set of
 * feature MSRs, but are handled separately to allow expedited lookups.
 */
static const u32 msr_based_features_all_except_vmx[] = {
	MSR_AMD64_DE_CFG,
	MSR_IA32_UCODE_REV,
	MSR_IA32_ARCH_CAPABILITIES,
	MSR_IA32_PERF_CAPABILITIES,
};

/* Room for the non-VMX feature MSRs plus the whole emulated VMX MSR range. */
static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all_except_vmx) +
			      (KVM_LAST_EMULATED_VMX_MSR - KVM_FIRST_EMULATED_VMX_MSR + 1)];
static unsigned int num_msr_based_features;

/*
 * All feature MSRs except uCode revID, which tracks the currently loaded uCode
 * patch, are immutable once the vCPU model is defined.
 *
 * Returns true for any MSR in the emulated VMX range and for every entry of
 * msr_based_features_all_except_vmx[] other than MSR_IA32_UCODE_REV.
 */
static bool kvm_is_immutable_feature_msr(u32 msr)
{
	int i;

	if (msr >= KVM_FIRST_EMULATED_VMX_MSR && msr <= KVM_LAST_EMULATED_VMX_MSR)
		return true;

	for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) {
		if (msr == msr_based_features_all_except_vmx[i])
			return msr != MSR_IA32_UCODE_REV;
	}

	return false;
}

/*
 * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM
 * does not yet virtualize.
These include:
 * 10 - MISC_PACKAGE_CTRLS
 * 11 - ENERGY_FILTERING_CTL
 * 12 - DOITM
 * 18 - FB_CLEAR_CTRL
 * 21 - XAPIC_DISABLE_STATUS
 * 23 - OVERCLOCKING_STATUS
 */

#define KVM_SUPPORTED_ARCH_CAP \
	(ARCH_CAP_RDCL_NO | ARCH_CAP_IBRS_ALL | ARCH_CAP_RSBA | \
	 ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \
	 ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \
	 ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \
	 ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO | \
	 ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR)

/*
 * Compute the IA32_ARCH_CAPABILITIES value KVM reports: the host value
 * masked by KVM_SUPPORTED_ARCH_CAP, with additional *_NO bits set when the
 * host is not affected by the corresponding bug.
 */
static u64 kvm_get_arch_capabilities(void)
{
	u64 data = host_arch_capabilities & KVM_SUPPORTED_ARCH_CAP;

	/*
	 * If nx_huge_pages is enabled, KVM's shadow paging will ensure that
	 * the nested hypervisor runs with NX huge pages.  If it is not,
	 * L1 is anyway vulnerable to ITLB_MULTIHIT exploits from other
	 * L1 guests, so it need not worry about its own (L2) guests.
	 */
	data |= ARCH_CAP_PSCHANGE_MC_NO;

	/*
	 * If we're doing cache flushes (either "always" or "cond")
	 * we will do one whenever the guest does a vmlaunch/vmresume.
	 * If an outer hypervisor is doing the cache flush for us
	 * (ARCH_CAP_SKIP_VMENTRY_L1DFLUSH), we can safely pass that
	 * capability to the guest too, and if EPT is disabled we're not
	 * vulnerable.  Overall, only VMENTER_L1D_FLUSH_NEVER will
	 * require a nested hypervisor to do a flush of its own.
	 */
	if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER)
		data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH;

	if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
		data |= ARCH_CAP_RDCL_NO;
	if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
		data |= ARCH_CAP_SSB_NO;
	if (!boot_cpu_has_bug(X86_BUG_MDS))
		data |= ARCH_CAP_MDS_NO;
	if (!boot_cpu_has_bug(X86_BUG_RFDS))
		data |= ARCH_CAP_RFDS_NO;

	if (!boot_cpu_has(X86_FEATURE_RTM)) {
		/*
		 * If RTM=0 because the kernel has disabled TSX, the host might
		 * have TAA_NO or TSX_CTRL.
Clear TAA_NO (the guest sees RTM=0 * and therefore knows that there cannot be TAA) but keep * TSX_CTRL: some buggy userspaces leave it set on tsx=on hosts, * and we want to allow migrating those guests to tsx=off hosts. */ data &= ~ARCH_CAP_TAA_NO; } else if (!boot_cpu_has_bug(X86_BUG_TAA)) { data |= ARCH_CAP_TAA_NO; } else { /* * Nothing to do here; we emulate TSX_CTRL if present on the * host so the guest can choose between disabling TSX or * using VERW to clear CPU buffers. */ } if (!boot_cpu_has_bug(X86_BUG_GDS) || gds_ucode_mitigated()) data |= ARCH_CAP_GDS_NO; return data; } static int kvm_get_msr_feature(struct kvm_msr_entry *msr) { switch (msr->index) { case MSR_IA32_ARCH_CAPABILITIES: msr->data = kvm_get_arch_capabilities(); break; case MSR_IA32_PERF_CAPABILITIES: msr->data = kvm_caps.supported_perf_cap; break; case MSR_IA32_UCODE_REV: rdmsrl_safe(msr->index, &msr->data); break; default: return static_call(kvm_x86_get_msr_feature)(msr); } return 0; } static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data) { struct kvm_msr_entry msr; int r; /* Unconditionally clear the output for simplicity */ msr.data = 0; msr.index = index; r = kvm_get_msr_feature(&msr); if (r == KVM_MSR_RET_INVALID && kvm_msr_ignored_check(index, 0, false)) r = 0; *data = msr.data; return r; } static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) { if (efer & EFER_AUTOIBRS && !guest_cpuid_has(vcpu, X86_FEATURE_AUTOIBRS)) return false; if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT)) return false; if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM)) return false; if (efer & (EFER_LME | EFER_LMA) && !guest_cpuid_has(vcpu, X86_FEATURE_LM)) return false; if (efer & EFER_NX && !guest_cpuid_has(vcpu, X86_FEATURE_NX)) return false; return true; } bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) { if (efer & efer_reserved_bits) return false; return __kvm_valid_efer(vcpu, efer); } EXPORT_SYMBOL_GPL(kvm_valid_efer); static int 
set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { u64 old_efer = vcpu->arch.efer; u64 efer = msr_info->data; int r; if (efer & efer_reserved_bits) return 1; if (!msr_info->host_initiated) { if (!__kvm_valid_efer(vcpu, efer)) return 1; if (is_paging(vcpu) && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) return 1; } efer &= ~EFER_LMA; efer |= vcpu->arch.efer & EFER_LMA; r = static_call(kvm_x86_set_efer)(vcpu, efer); if (r) { WARN_ON(r > 0); return r; } if ((efer ^ old_efer) & KVM_MMU_EFER_ROLE_BITS) kvm_mmu_reset_context(vcpu); if (!static_cpu_has(X86_FEATURE_XSAVES) && (efer & EFER_SVME)) kvm_hv_xsaves_xsavec_maybe_warn(vcpu); return 0; } void kvm_enable_efer_bits(u64 mask) { efer_reserved_bits &= ~mask; } EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type) { struct kvm_x86_msr_filter *msr_filter; struct msr_bitmap_range *ranges; struct kvm *kvm = vcpu->kvm; bool allowed; int idx; u32 i; /* x2APIC MSRs do not support filtering. */ if (index >= 0x800 && index <= 0x8ff) return true; idx = srcu_read_lock(&kvm->srcu); msr_filter = srcu_dereference(kvm->arch.msr_filter, &kvm->srcu); if (!msr_filter) { allowed = true; goto out; } allowed = msr_filter->default_allow; ranges = msr_filter->ranges; for (i = 0; i < msr_filter->count; i++) { u32 start = ranges[i].base; u32 end = start + ranges[i].nmsrs; u32 flags = ranges[i].flags; unsigned long *bitmap = ranges[i].bitmap; if ((index >= start) && (index < end) && (flags & type)) { allowed = test_bit(index - start, bitmap); break; } } out: srcu_read_unlock(&kvm->srcu, idx); return allowed; } EXPORT_SYMBOL_GPL(kvm_msr_allowed); /* * Write @data into the MSR specified by @index. Select MSR specific fault * checks are bypassed if @host_initiated is %true. * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. 
*/
static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
			 bool host_initiated)
{
	struct msr_data msr;

	/* Common pre-checks/fixups done before the vendor set_msr hook. */
	switch (index) {
	case MSR_FS_BASE:
	case MSR_GS_BASE:
	case MSR_KERNEL_GS_BASE:
	case MSR_CSTAR:
	case MSR_LSTAR:
		if (is_noncanonical_address(data, vcpu))
			return 1;
		break;
	case MSR_IA32_SYSENTER_EIP:
	case MSR_IA32_SYSENTER_ESP:
		/*
		 * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if
		 * non-canonical address is written on Intel but not on
		 * AMD (which ignores the top 32-bits, because it does
		 * not implement 64-bit SYSENTER).
		 *
		 * 64-bit code should hence be able to write a non-canonical
		 * value on AMD. Making the address canonical ensures that
		 * vmentry does not fail on Intel after writing a non-canonical
		 * value, and that something deterministic happens if the guest
		 * invokes 64-bit SYSENTER.
		 */
		data = __canonical_address(data, vcpu_virt_addr_bits(vcpu));
		break;
	case MSR_TSC_AUX:
		if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX))
			return 1;

		/* Guest accesses require RDTSCP or RDPID in guest CPUID. */
		if (!host_initiated &&
		    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) &&
		    !guest_cpuid_has(vcpu, X86_FEATURE_RDPID))
			return 1;

		/*
		 * Per Intel's SDM, bits 63:32 are reserved, but AMD's APM has
		 * incomplete and conflicting architectural behavior. Current
		 * AMD CPUs completely ignore bits 63:32, i.e. they aren't
		 * reserved and always read as zeros. Enforce Intel's reserved
		 * bits check if and only if the guest CPU is Intel, and clear
		 * the bits in all other cases. This ensures cross-vendor
		 * migration will provide consistent behavior for the guest.
		 */
		if (guest_cpuid_is_intel(vcpu) && (data >> 32) != 0)
			return 1;

		data = (u32)data;
		break;
	}

	msr.data = data;
	msr.index = index;
	msr.host_initiated = host_initiated;

	return static_call(kvm_x86_set_msr)(vcpu, &msr);
}

/*
 * __kvm_set_msr() wrapper: a KVM_MSR_RET_INVALID result is turned into
 * success (0) when kvm_msr_ignored_check() returns true for this write.
 */
static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
				     u32 index, u64 data, bool host_initiated)
{
	int ret = __kvm_set_msr(vcpu, index, data, host_initiated);

	if (ret == KVM_MSR_RET_INVALID)
		if (kvm_msr_ignored_check(index, data, true))
			ret = 0;

	return ret;
}

/*
 * Read the MSR specified by @index into @data. Select MSR specific fault
 * checks are bypassed if @host_initiated is %true.
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
		  bool host_initiated)
{
	struct msr_data msr;
	int ret;

	switch (index) {
	case MSR_TSC_AUX:
		if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX))
			return 1;

		/* Same guest CPUID requirement as on the write side. */
		if (!host_initiated &&
		    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) &&
		    !guest_cpuid_has(vcpu, X86_FEATURE_RDPID))
			return 1;
		break;
	}

	msr.index = index;
	msr.host_initiated = host_initiated;

	ret = static_call(kvm_x86_get_msr)(vcpu, &msr);
	/* *data is only written when the read succeeded. */
	if (!ret)
		*data = msr.data;
	return ret;
}

/*
 * __kvm_get_msr() wrapper: on KVM_MSR_RET_INVALID, *@data is zeroed and the
 * result becomes success (0) when kvm_msr_ignored_check() returns true.
 */
static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu,
				     u32 index, u64 *data, bool host_initiated)
{
	int ret = __kvm_get_msr(vcpu, index, data, host_initiated);

	if (ret == KVM_MSR_RET_INVALID) {
		/* Unconditionally clear *data for simplicity */
		*data = 0;
		if (kvm_msr_ignored_check(index, 0, false))
			ret = 0;
	}

	return ret;
}

/*
 * Non-host-initiated MSR read that first consults the MSR filter; returns
 * KVM_MSR_RET_FILTERED without reading if the filter denies the access.
 */
static int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data)
{
	if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
		return KVM_MSR_RET_FILTERED;
	return kvm_get_msr_ignored_check(vcpu, index, data, false);
}

/*
 * Non-host-initiated MSR write that first consults the MSR filter; returns
 * KVM_MSR_RET_FILTERED without writing if the filter denies the access.
 */
static int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data)
{
	if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE))
		return KVM_MSR_RET_FILTERED;
	return kvm_set_msr_ignored_check(vcpu, index, data, false);
}

/* Non-host-initiated MSR read; unlike the _with_filter variant, no filter check. */
int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
{
	return kvm_get_msr_ignored_check(vcpu, index, data, false);
}
EXPORT_SYMBOL_GPL(kvm_get_msr);

/* Non-host-initiated MSR write; unlike the _with_filter variant, no filter check. */
int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
{
	return kvm_set_msr_ignored_check(vcpu, index, data, false);
}
EXPORT_SYMBOL_GPL(kvm_set_msr);

/*
 * If run->msr.error is zero, split run->msr.data into its low and high
 * 32-bit halves and write them via kvm_rax_write()/kvm_rdx_write().
 */
static void complete_userspace_rdmsr(struct kvm_vcpu *vcpu)
{
	if (!vcpu->run->msr.error) {
		kvm_rax_write(vcpu, (u32)vcpu->run->msr.data);
		kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32);
	}
}

/* Finish an emulated MSR access, passing run->msr.error as the error flag. */
static int complete_emulated_msr_access(struct kvm_vcpu *vcpu)
{
	return complete_emulated_insn_gp(vcpu, vcpu->run->msr.error);
}

/* RDMSR variant: propagate the data to RAX/RDX first, then complete. */
static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
{
	complete_userspace_rdmsr(vcpu);
	return complete_emulated_msr_access(vcpu);
}

/* Finish an MSR access through the kvm_x86_complete_emulated_msr hook. */
static int complete_fast_msr_access(struct kvm_vcpu *vcpu)
{
	return static_call(kvm_x86_complete_emulated_msr)(vcpu, vcpu->run->msr.error);
}

/* RDMSR variant: propagate the data to RAX/RDX first, then complete. */
static int complete_fast_rdmsr(struct kvm_vcpu *vcpu)
{
	complete_userspace_rdmsr(vcpu);
	return complete_fast_msr_access(vcpu);
}

/* Map an MSR access return code @r to a KVM_MSR_EXIT_REASON_* value. */
static u64 kvm_msr_reason(int r)
{
	switch (r) {
	case KVM_MSR_RET_INVALID:
		return KVM_MSR_EXIT_REASON_UNKNOWN;
	case KVM_MSR_RET_FILTERED:
		return KVM_MSR_EXIT_REASON_FILTER;
	default:
		return KVM_MSR_EXIT_REASON_INVAL;
	}
}

/*
 * Prepare vcpu->run for an MSR exit with @exit_reason if the VM's
 * user_space_msr_mask contains the reason derived from @r.
 *
 * Returns 1 if vcpu->run was filled in and @completion was installed as the
 * complete_userspace_io callback, 0 if the reason is not in the mask (in
 * which case nothing is modified).
 */
static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index,
			      u32 exit_reason, u64 data,
			      int (*completion)(struct kvm_vcpu *vcpu),
			      int r)
{
	u64 msr_reason = kvm_msr_reason(r);

	/* Check if the user wanted to know about this MSR fault */
	if (!(vcpu->kvm->arch.user_space_msr_mask & msr_reason))
		return 0;

	vcpu->run->exit_reason = exit_reason;
	vcpu->run->msr.error = 0;
	memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad));
	vcpu->run->msr.reason = msr_reason;
	vcpu->run->msr.index = index;
	vcpu->run->msr.data = data;
	vcpu->arch.complete_userspace_io = completion;

	return 1;
}

/*
 * Emulate RDMSR for the MSR index read via kvm_rcx_read().
 *
 * On success the value is split across RAX (low 32 bits) and RDX (high 32
 * bits).  On failure, returns 0 if kvm_msr_user_space() set up an MSR exit;
 * otherwise the failure code is handed to the
 * kvm_x86_complete_emulated_msr hook.
 */
int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
{
	u32 ecx = kvm_rcx_read(vcpu);
	u64 data;
	int r;

	r = kvm_get_msr_with_filter(vcpu, ecx, &data);

	if (!r) {
		trace_kvm_msr_read(ecx, data);

		kvm_rax_write(vcpu, data & -1u);
		kvm_rdx_write(vcpu, (data >> 32) & -1u);
	} else {
		/* MSR read failed? See if we should ask user space */
		if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_RDMSR, 0,
				       complete_fast_rdmsr, r))
			return 0;
		trace_kvm_msr_read_ex(ecx);
	}

	return static_call(kvm_x86_complete_emulated_msr)(vcpu, r);
}
EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr);

/*
 * Emulate WRMSR for the MSR index read via kvm_rcx_read(), with the value
 * obtained from kvm_read_edx_eax().
 *
 * On failure, returns 0 if kvm_msr_user_space() set up an MSR exit, returns
 * negative error codes directly, and hands any other failure code to the
 * kvm_x86_complete_emulated_msr hook.
 */
int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
{
	u32 ecx = kvm_rcx_read(vcpu);
	u64 data = kvm_read_edx_eax(vcpu);
	int r;

	r = kvm_set_msr_with_filter(vcpu, ecx, data);

	if (!r) {
		trace_kvm_msr_write(ecx, data);
	} else {
		/* MSR write failed? See if we should ask user space */
		if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_WRMSR, data,
				       complete_fast_msr_access, r))
			return 0;
		/* Signal all other negative errors to userspace */
		if (r < 0)
			return r;
		trace_kvm_msr_write_ex(ecx, data);
	}

	return static_call(kvm_x86_complete_emulated_msr)(vcpu, r);
}
EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);

/* Emulate an instruction as a NOP by simply skipping over it. */
int kvm_emulate_as_nop(struct kvm_vcpu *vcpu)
{
	return kvm_skip_emulated_instruction(vcpu);
}

int kvm_emulate_invd(struct kvm_vcpu *vcpu)
{
	/* Treat an INVD instruction as a NOP and just skip it. */
	return kvm_emulate_as_nop(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_emulate_invd);

/* Queue a #UD (UD_VECTOR) exception for the vCPU; always returns 1. */
int kvm_handle_invalid_op(struct kvm_vcpu *vcpu)
{
	kvm_queue_exception(vcpu, UD_VECTOR);
	return 1;
}
EXPORT_SYMBOL_GPL(kvm_handle_invalid_op);


/*
 * Common MONITOR/MWAIT handling; @insn is the instruction name used in the
 * one-time warning.  Queues #UD when the
 * KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS quirk is off and the guest CPUID
 * lacks MWAIT; otherwise the instruction is emulated as a NOP.
 */
static int kvm_emulate_monitor_mwait(struct kvm_vcpu *vcpu, const char *insn)
{
	if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) &&
	    !guest_cpuid_has(vcpu, X86_FEATURE_MWAIT))
		return kvm_handle_invalid_op(vcpu);

	pr_warn_once("%s instruction emulated as NOP!\n", insn);
	return kvm_emulate_as_nop(vcpu);
}

int kvm_emulate_mwait(struct kvm_vcpu *vcpu)
{
	return kvm_emulate_monitor_mwait(vcpu, "MWAIT");
}
EXPORT_SYMBOL_GPL(kvm_emulate_mwait);

int kvm_emulate_monitor(struct kvm_vcpu *vcpu)
{
	return kvm_emulate_monitor_mwait(vcpu, "MONITOR");
}
EXPORT_SYMBOL_GPL(kvm_emulate_monitor);

/*
 * Returns true if the vCPU is in EXITING_GUEST_MODE, has a request pending,
 * or has xfer-to-guest-mode work pending.  xfer_to_guest_mode_prepare() is
 * called before the checks.
 */
static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
{
	xfer_to_guest_mode_prepare();
	return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) ||
		xfer_to_guest_mode_work_pending();
}

/*
 * The fast path for frequent and performance sensitive wrmsr emulation,
 * i.e. the sending of IPI, sending IPI early in the VM-Exit flow reduces
 * the latency of virtual IPI by avoiding the expensive bits of transitioning
 * from guest to host, e.g. reacquiring KVM's SRCU lock. In contrast to the
 * other cases which must be called after interrupts are enabled on the host.
*/
static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data)
{
	/* Only usable with an in-kernel local APIC in x2APIC mode. */
	if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic))
		return 1;

	/*
	 * Take the fast path only for fixed-mode, physical-destination,
	 * no-shorthand, non-broadcast ICR values; everything else returns 1.
	 */
	if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) &&
	    ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
	    ((data & APIC_MODE_MASK) == APIC_DM_FIXED) &&
	    ((u32)(data >> 32) != X2APIC_BROADCAST))
		return kvm_x2apic_icr_write(vcpu->arch.apic, data);

	return 1;
}

/*
 * Fast path for a TSC deadline write: returns 1 (not handled) unless
 * kvm_can_use_hv_timer() is true, else programs the deadline and returns 0.
 */
static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data)
{
	if (!kvm_can_use_hv_timer(vcpu))
		return 1;

	kvm_set_lapic_tscdeadline_msr(vcpu, data);
	return 0;
}

/*
 * Try to handle a WRMSR exit on the fast path.
 *
 * Only the x2APIC ICR and MSR_IA32_TSC_DEADLINE are candidates.  Returns
 * EXIT_FASTPATH_NONE if the write was not handled here; otherwise the
 * instruction has been skipped and the write traced.  Runs with the vCPU's
 * SRCU read lock held for the duration of the switch.
 */
fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
{
	u32 msr = kvm_rcx_read(vcpu);
	u64 data;
	fastpath_t ret = EXIT_FASTPATH_NONE;

	kvm_vcpu_srcu_read_lock(vcpu);

	switch (msr) {
	case APIC_BASE_MSR + (APIC_ICR >> 4):
		data = kvm_read_edx_eax(vcpu);
		if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) {
			kvm_skip_emulated_instruction(vcpu);
			ret = EXIT_FASTPATH_EXIT_HANDLED;
		}
		break;
	case MSR_IA32_TSC_DEADLINE:
		data = kvm_read_edx_eax(vcpu);
		if (!handle_fastpath_set_tscdeadline(vcpu, data)) {
			kvm_skip_emulated_instruction(vcpu);
			ret = EXIT_FASTPATH_REENTER_GUEST;
		}
		break;
	default:
		break;
	}

	/* data is only read here when one of the cases above assigned it. */
	if (ret != EXIT_FASTPATH_NONE)
		trace_kvm_msr_write(msr, data);

	kvm_vcpu_srcu_read_unlock(vcpu);

	return ret;
}
EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff);

/*
 * Adapt set_msr() to msr_io()'s calling convention
 */
static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
	/* Host-initiated read. */
	return kvm_get_msr_ignored_check(vcpu, index, data, true);
}

/*
 * Host-initiated write of *@data to MSR @index.  Returns -EINVAL when an
 * immutable feature MSR would change after the vCPU has run.
 */
static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
	u64 val;

	/*
	 * Disallow writes to immutable feature MSRs after KVM_RUN. KVM does
	 * not support modifying the guest vCPU model on the fly, e.g. changing
	 * the nVMX capabilities while L2 is running is nonsensical. Ignore
	 * writes of the same value, e.g. to allow userspace to blindly stuff
	 * all MSRs when emulating RESET.
	 */
	if (kvm_vcpu_has_run(vcpu) && kvm_is_immutable_feature_msr(index)) {
		if (do_get_msr(vcpu, index, &val) || *data != val)
			return -EINVAL;

		return 0;
	}

	return kvm_set_msr_ignored_check(vcpu, index, *data, true);
}

#ifdef CONFIG_X86_64
/* Per-clock fields copied from a timekeeper read base by update_pvclock_gtod(). */
struct pvclock_clock {
	int vclock_mode;
	u64 cycle_last;
	u64 mask;
	u32 mult;
	u32 shift;
	u64 base_cycles;
	u64 offset;
};

/* Snapshot of timekeeper state; readers and the writer synchronize on @seq. */
struct pvclock_gtod_data {
	seqcount_t	seq;

	struct pvclock_clock clock; /* extract of a clocksource struct */
	struct pvclock_clock raw_clock; /* extract of a clocksource struct */

	ktime_t		offs_boot;
	u64		wall_time_sec;
};

static struct pvclock_gtod_data pvclock_gtod_data;

/*
 * Copy the fields KVM needs from timekeeper @tk into pvclock_gtod_data,
 * inside a seqcount write section.
 */
static void update_pvclock_gtod(struct timekeeper *tk)
{
	struct pvclock_gtod_data *vdata = &pvclock_gtod_data;

	write_seqcount_begin(&vdata->seq);

	/* copy pvclock gtod data */
	vdata->clock.vclock_mode	= tk->tkr_mono.clock->vdso_clock_mode;
	vdata->clock.cycle_last		= tk->tkr_mono.cycle_last;
	vdata->clock.mask		= tk->tkr_mono.mask;
	vdata->clock.mult		= tk->tkr_mono.mult;
	vdata->clock.shift		= tk->tkr_mono.shift;
	vdata->clock.base_cycles	= tk->tkr_mono.xtime_nsec;
	vdata->clock.offset		= tk->tkr_mono.base;

	vdata->raw_clock.vclock_mode	= tk->tkr_raw.clock->vdso_clock_mode;
	vdata->raw_clock.cycle_last	= tk->tkr_raw.cycle_last;
	vdata->raw_clock.mask		= tk->tkr_raw.mask;
	vdata->raw_clock.mult		= tk->tkr_raw.mult;
	vdata->raw_clock.shift		= tk->tkr_raw.shift;
	vdata->raw_clock.base_cycles	= tk->tkr_raw.xtime_nsec;
	vdata->raw_clock.offset		= tk->tkr_raw.base;

	vdata->wall_time_sec            = tk->xtime_sec;

	vdata->offs_boot		= tk->offs_boot;

	write_seqcount_end(&vdata->seq);
}

static s64 get_kvmclock_base_ns(void)
{
	/* Count up from boot time, but with the frequency of the raw clock. */
	return ktime_to_ns(ktime_add(ktime_get_raw(), pvclock_gtod_data.offs_boot));
}
#else
static s64 get_kvmclock_base_ns(void)
{
	/* Master clock not used, so we can just use CLOCK_BOOTTIME. */
	return ktime_get_boottime_ns();
}
#endif

/*
 * Write the wall clock structure to guest memory at @wall_clock, and, if
 * @sec_hi_ofs is non-zero, the upper 32 bits of the seconds value at
 * @wall_clock + @sec_hi_ofs.  Does nothing if @wall_clock is 0.
 *
 * The guest-visible version field is made odd before the payload is written
 * and even again afterwards.
 */
static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs)
{
	int version;
	int r;
	struct pvclock_wall_clock wc;
	u32 wc_sec_hi;
	u64 wall_nsec;

	if (!wall_clock)
		return;

	r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
	if (r)
		return;

	if (version & 1)
		++version;  /* first time write, random junk */

	/* Now odd: marks the update as in progress. */
	++version;

	if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version)))
		return;

	wall_nsec = kvm_get_wall_clock_epoch(kvm);

	/* do_div() leaves the quotient (seconds) in wall_nsec, returns the remainder. */
	wc.nsec = do_div(wall_nsec, NSEC_PER_SEC);
	wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */
	wc.version = version;

	kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));

	if (sec_hi_ofs) {
		wc_sec_hi = wall_nsec >> 32;
		kvm_write_guest(kvm, wall_clock + sec_hi_ofs,
				&wc_sec_hi, sizeof(wc_sec_hi));
	}

	/* Even again: the update is complete. */
	version++;
	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
}

/*
 * Handle a write of @system_time to a kvmclock system-time MSR.
 *
 * @old_msr says whether the old MSR variant was used; for a guest write on
 * vCPU 0 a change in that choice requests a masterclock update.  Bit 0 of
 * @system_time is the enable bit: when set, the pv_time cache is activated
 * at the address given by the remaining bits, otherwise it is deactivated.
 */
static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time,
				  bool old_msr, bool host_initiated)
{
	struct kvm_arch *ka = &vcpu->kvm->arch;

	if (vcpu->vcpu_id == 0 && !host_initiated) {
		if (ka->boot_vcpu_runs_old_kvmclock != old_msr)
			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);

		ka->boot_vcpu_runs_old_kvmclock = old_msr;
	}

	vcpu->arch.time = system_time;
	kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);

	/* we verify if the enable bit is set... */
	if (system_time & 1)
		kvm_gpc_activate(&vcpu->arch.pv_time, system_time & ~1ULL,
				 sizeof(struct pvclock_vcpu_time_info));
	else
		kvm_gpc_deactivate(&vcpu->arch.pv_time);

	return;
}

/*
 * NOTE(review): do_shl32_div32() is defined elsewhere; presumably it leaves
 * (dividend << 32) / divisor in @dividend -- confirm.
 */
static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
{
	do_shl32_div32(dividend, divisor);
	return dividend;
}

/*
 * Derive a (shift, 32-bit multiplier) pair from @scaled_hz and @base_hz and
 * store it in *@pshift / *@pmultiplier.  kvm_set_tsc_khz() uses the result
 * to convert nanoseconds to TSC cycles.
 */
static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
			       s8 *pshift, u32 *pmultiplier)
{
	uint64_t scaled64;
	int32_t  shift = 0;
	uint64_t tps64;
	uint32_t tps32;

	tps64 = base_hz;
	scaled64 = scaled_hz;
	/* Halve tps64 until it fits in 32 bits and is at most 2 * scaled64. */
	while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
		tps64 >>= 1;
		shift--;
	}

	tps32 = (uint32_t)tps64;
	/* Loop until scaled64 fits in 32 bits and is below tps32. */
	while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
		if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
			scaled64 >>= 1;
		else
			tps32 <<= 1;
		shift++;
	}

	*pshift = shift;
	*pmultiplier = div_frac(scaled64, tps32);
}

#ifdef CONFIG_X86_64
static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
#endif

static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
static unsigned long max_tsc_khz;

/* Returns @khz scaled by (1000000 + @ppm) / 1000000, truncated to u32. */
static u32 adjust_tsc_khz(u32 khz, s32 ppm)
{
	u64 v = (u64)khz * (1000000 + ppm);
	do_div(v, 1000000);
	return v;
}

static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier);

/*
 * Apply @user_tsc_khz to @vcpu.  @scale says whether the rate differs enough
 * from the host rate to need scaling or catchup.
 *
 * Returns 0 on success, -1 if the rate cannot be provided.
 */
static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
{
	u64 ratio;

	/* Guest TSC same frequency as host TSC? */
	if (!scale) {
		kvm_vcpu_write_tsc_multiplier(vcpu, kvm_caps.default_tsc_scaling_ratio);
		return 0;
	}

	/* TSC scaling supported? */
	if (!kvm_caps.has_tsc_control) {
		/* No scaling support: a faster guest rate relies on catchup. */
		if (user_tsc_khz > tsc_khz) {
			vcpu->arch.tsc_catchup = 1;
			vcpu->arch.tsc_always_catchup = 1;
			return 0;
		} else {
			pr_warn_ratelimited("user requested TSC rate below hardware speed\n");
			return -1;
		}
	}

	/* TSC scaling required  - calculate ratio */
	ratio = mul_u64_u32_div(1ULL << kvm_caps.tsc_scaling_ratio_frac_bits,
				user_tsc_khz, tsc_khz);

	if (ratio == 0 || ratio >= kvm_caps.max_tsc_scaling_ratio) {
		pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
			            user_tsc_khz);
		return -1;
	}

	kvm_vcpu_write_tsc_multiplier(vcpu, ratio);
	return 0;
}

/*
 * Set the virtual TSC frequency of @vcpu to @user_tsc_khz.
 *
 * Returns -1 if @user_tsc_khz is 0 (after resetting the multiplier to the
 * default ratio), otherwise the result of set_tsc_khz().
 */
static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
{
	u32 thresh_lo, thresh_hi;
	int use_scaling = 0;

	/* tsc_khz can be zero if TSC calibration fails */
	if (user_tsc_khz == 0) {
		/* set tsc_scaling_ratio to a safe value */
		kvm_vcpu_write_tsc_multiplier(vcpu, kvm_caps.default_tsc_scaling_ratio);
		return -1;
	}

	/* Compute a scale to convert nanoseconds in TSC cycles */
	kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC,
			   &vcpu->arch.virtual_tsc_shift,
			   &vcpu->arch.virtual_tsc_mult);
	vcpu->arch.virtual_tsc_khz = user_tsc_khz;

	/*
	 * Compute the variation in TSC rate which is acceptable
	 * within the range of tolerance and decide if the
	 * rate being applied is within that bounds of the hardware
	 * rate.  If so, no scaling or compensation need be done.
	 */
	thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
	thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
	if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) {
		pr_debug("requested TSC rate %u falls outside tolerance [%u,%u]\n",
			 user_tsc_khz, thresh_lo, thresh_hi);
		use_scaling = 1;
	}
	return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
}

/*
 * Compute the guest TSC at @kernel_ns: the time elapsed since
 * this_tsc_nsec, scaled by the vCPU's virtual TSC mult/shift, added to
 * this_tsc_write.
 */
static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
{
	u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
				      vcpu->arch.virtual_tsc_mult,
				      vcpu->arch.virtual_tsc_shift);
	tsc += vcpu->arch.this_tsc_write;
	return tsc;
}

#ifdef CONFIG_X86_64
/* True for the two vDSO clock modes treated as TSC based: TSC and HVCLOCK. */
static inline bool gtod_is_based_on_tsc(int mode)
{
	return mode == VDSO_CLOCKMODE_TSC || mode == VDSO_CLOCKMODE_HVCLOCK;
}
#endif

/*
 * Request a masterclock update for @vcpu when the masterclock state needs to
 * change; @new_generation is true when a new TSC generation was just
 * started.  Compiles to an empty function without CONFIG_X86_64.
 */
static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu, bool new_generation)
{
#ifdef CONFIG_X86_64
	struct kvm_arch *ka = &vcpu->kvm->arch;
	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;

	/*
	 * To use the masterclock, the host clocksource must be based on TSC
	 * and all vCPUs must have matching TSCs.  Note, the count for matching
	 * vCPUs doesn't include the reference vCPU, hence "+1".
	 */
	bool use_master_clock = (ka->nr_vcpus_matched_tsc + 1 ==
				 atomic_read(&vcpu->kvm->online_vcpus)) &&
				gtod_is_based_on_tsc(gtod->clock.vclock_mode);

	/*
	 * Request a masterclock update if the masterclock needs to be toggled
	 * on/off, or when starting a new generation and the masterclock is
	 * enabled (compute_guest_tsc() requires the masterclock snapshot to be
	 * taken _after_ the new generation is created).
	 */
	if ((ka->use_master_clock && new_generation) ||
	    (ka->use_master_clock != use_master_clock))
		kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);

	trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
			    atomic_read(&vcpu->kvm->online_vcpus),
		            ka->use_master_clock, gtod->clock.vclock_mode);
#endif
}

/*
 * Multiply tsc by a fixed point number represented by ratio.
*
 * The most significant 64-N bits (mult) of ratio represent the
 * integral part of the fixed point number; the remaining N bits
 * (frac) represent the fractional part, ie. ratio represents a fixed
 * point number (mult + frac * 2^(-N)).
 *
 * N equals to kvm_caps.tsc_scaling_ratio_frac_bits.
 */
static inline u64 __scale_tsc(u64 ratio, u64 tsc)
{
	return mul_u64_u64_shr(tsc, ratio, kvm_caps.tsc_scaling_ratio_frac_bits);
}

/*
 * Scale @tsc by @ratio; the multiplication is skipped when @ratio is the
 * default scaling ratio, in which case @tsc is returned unchanged.
 */
u64 kvm_scale_tsc(u64 tsc, u64 ratio)
{
	u64 _tsc = tsc;

	if (ratio != kvm_caps.default_tsc_scaling_ratio)
		_tsc = __scale_tsc(ratio, tsc);

	return _tsc;
}

/*
 * Returns the L1 TSC offset that makes the guest observe @target_tsc now:
 * @target_tsc minus the current host TSC scaled by the L1 ratio.
 */
static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
{
	u64 tsc;

	tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio);

	return target_tsc - tsc;
}

/* Returns the L1 TSC for @host_tsc: the scaled host TSC plus the L1 offset. */
u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
{
	return vcpu->arch.l1_tsc_offset +
		kvm_scale_tsc(host_tsc, vcpu->arch.l1_tsc_scaling_ratio);
}
EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);

/*
 * Combine the L1 offset with L2's offset and multiplier: @l1_offset is
 * scaled by @l2_multiplier (as a signed value) unless that is the default
 * ratio, then @l2_offset is added.
 */
u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier)
{
	u64 nested_offset;

	if (l2_multiplier == kvm_caps.default_tsc_scaling_ratio)
		nested_offset = l1_offset;
	else
		nested_offset = mul_s64_u64_shr((s64) l1_offset, l2_multiplier,
						kvm_caps.tsc_scaling_ratio_frac_bits);

	nested_offset += l2_offset;
	return nested_offset;
}
EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_offset);

/*
 * Returns the fixed-point product of the two multipliers, or
 * @l1_multiplier unchanged when @l2_multiplier is the default ratio.
 */
u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier)
{
	if (l2_multiplier != kvm_caps.default_tsc_scaling_ratio)
		return mul_u64_u64_shr(l1_multiplier, l2_multiplier,
				       kvm_caps.tsc_scaling_ratio_frac_bits);

	return l1_multiplier;
}
EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_multiplier);

/*
 * Record @l1_offset as the vCPU's L1 TSC offset, recompute the effective
 * tsc_offset, and push it out through the kvm_x86_write_tsc_offset hook.
 */
static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset)
{
	trace_kvm_write_tsc_offset(vcpu->vcpu_id,
				   vcpu->arch.l1_tsc_offset,
				   l1_offset);

	vcpu->arch.l1_tsc_offset = l1_offset;

	/*
	 * If we are here because L1 chose not to trap WRMSR to TSC then
	 * according to the spec this should set L1's TSC (as opposed to
	 * setting L1's offset for L2).
	 */
	if (is_guest_mode(vcpu))
		vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
			l1_offset,
			static_call(kvm_x86_get_l2_tsc_offset)(vcpu),
			static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu));
	else
		vcpu->arch.tsc_offset = l1_offset;

	static_call(kvm_x86_write_tsc_offset)(vcpu);
}

/*
 * Record @l1_multiplier as the vCPU's L1 TSC scaling ratio and recompute the
 * effective tsc_scaling_ratio.  The kvm_x86_write_tsc_multiplier hook is
 * only invoked when kvm_caps.has_tsc_control is set.
 */
static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier)
{
	vcpu->arch.l1_tsc_scaling_ratio = l1_multiplier;

	/* Userspace is changing the multiplier while L2 is active */
	if (is_guest_mode(vcpu))
		vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
			l1_multiplier,
			static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu));
	else
		vcpu->arch.tsc_scaling_ratio = l1_multiplier;

	if (kvm_caps.has_tsc_control)
		static_call(kvm_x86_write_tsc_multiplier)(vcpu);
}

/* Returns check_tsc_unstable(), except as noted for the Hyper-V clock mode. */
static inline bool kvm_check_tsc_unstable(void)
{
#ifdef CONFIG_X86_64
	/*
	 * TSC is marked unstable when we're running on Hyper-V,
	 * 'TSC page' clocksource is good.
	 */
	if (pvclock_gtod_data.clock.vclock_mode == VDSO_CLOCKMODE_HVCLOCK)
		return false;
#endif
	return check_tsc_unstable();
}

/*
 * Infers attempts to synchronize the guest's tsc from host writes. Sets the
 * offset for the vcpu and tracks the TSC matching generation that the vcpu
 * participates in.
 *
 * Must be called with kvm->arch.tsc_write_lock held.  @matched says whether
 * this write was judged to match the current generation.
 */
static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc,
				  u64 ns, bool matched)
{
	struct kvm *kvm = vcpu->kvm;

	lockdep_assert_held(&kvm->arch.tsc_write_lock);

	/*
	 * We also track the most recent recorded KHZ, write and time to
	 * allow the matching interval to be extended at each write.
	 */
	kvm->arch.last_tsc_nsec = ns;
	kvm->arch.last_tsc_write = tsc;
	kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
	kvm->arch.last_tsc_offset = offset;

	vcpu->arch.last_guest_tsc = tsc;

	kvm_vcpu_write_tsc_offset(vcpu, offset);

	if (!matched) {
		/*
		 * We split periods of matched TSC writes into generations.
		 * For each generation, we track the original measured
		 * nanosecond time, offset, and write, so if TSCs are in
		 * sync, we can match exact offset, and if not, we can match
		 * exact software computation in compute_guest_tsc()
		 *
		 * These values are tracked in kvm->arch.cur_xxx variables.
		 */
		kvm->arch.cur_tsc_generation++;
		kvm->arch.cur_tsc_nsec = ns;
		kvm->arch.cur_tsc_write = tsc;
		kvm->arch.cur_tsc_offset = offset;
		kvm->arch.nr_vcpus_matched_tsc = 0;
	} else if (vcpu->arch.this_tsc_generation != kvm->arch.cur_tsc_generation) {
		/* First matched write by this vCPU in the current generation. */
		kvm->arch.nr_vcpus_matched_tsc++;
	}

	/* Keep track of which generation this VCPU has synchronized to */
	vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
	vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
	vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;

	kvm_track_tsc_matching(vcpu, !matched);
}

/*
 * Set the guest TSC of @vcpu.  @user_value points to the value to write, or
 * is NULL, in which case the value 0 is used and user_set_tsc is left
 * untouched.  Takes kvm->arch.tsc_write_lock with interrupts disabled.
 */
static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value)
{
	u64 data = user_value ? *user_value : 0;
	struct kvm *kvm = vcpu->kvm;
	u64 offset, ns, elapsed;
	unsigned long flags;
	bool matched = false;
	bool synchronizing = false;

	raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
	offset = kvm_compute_l1_tsc_offset(vcpu, data);
	ns = get_kvmclock_base_ns();
	elapsed = ns - kvm->arch.last_tsc_nsec;

	if (vcpu->arch.virtual_tsc_khz) {
		if (data == 0) {
			/*
			 * Force synchronization when creating a vCPU, or when
			 * userspace explicitly writes a zero value.
			 */
			synchronizing = true;
		} else if (kvm->arch.user_set_tsc) {
			u64 tsc_exp = kvm->arch.last_tsc_write +
						nsec_to_cycles(vcpu, elapsed);
			u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
			/*
			 * Here lies UAPI baggage: when a user-initiated TSC write has
			 * a small delta (1 second) of virtual cycle time against the
			 * previously set vCPU, we assume that they were intended to be
			 * in sync and the delta was only due to the racy nature of the
			 * legacy API.
			 *
			 * This trick falls down when restoring a guest which genuinely
			 * has been running for less time than the 1 second of imprecision
			 * which we allow for in the legacy API. In this case, the first
			 * value written by userspace (on any vCPU) should not be subject
			 * to this 'correction' to make it sync up with values that only
			 * come from the kernel's default vCPU creation. Make the 1-second
			 * slop hack only trigger if the user_set_tsc flag is already set.
			 */
			synchronizing = data < tsc_exp + tsc_hz &&
					data + tsc_hz > tsc_exp;
		}
	}

	/* Set only after the check above, so the first user write sees it clear. */
	if (user_value)
		kvm->arch.user_set_tsc = true;

	/*
	 * For a reliable TSC, we can match TSC offsets, and for an unstable
	 * TSC, we add elapsed time in this computation.  We could let the
	 * compensation code attempt to catch up if we fall behind, but
	 * it's better to try to match offsets from the beginning.
	 */
	if (synchronizing &&
	    vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
		if (!kvm_check_tsc_unstable()) {
			offset = kvm->arch.cur_tsc_offset;
		} else {
			u64 delta = nsec_to_cycles(vcpu, elapsed);
			data += delta;
			offset = kvm_compute_l1_tsc_offset(vcpu, data);
		}
		matched = true;
	}

	__kvm_synchronize_tsc(vcpu, offset, data, ns, matched);
	raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
}

/* Add @adjustment, taken as guest TSC cycles, to the L1 TSC offset. */
static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
					   s64 adjustment)
{
	u64 tsc_offset = vcpu->arch.l1_tsc_offset;
	kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment);
}

/*
 * Scale @adjustment by the L1 scaling ratio, then add it to the L1 TSC
 * offset.  Warns on a negative adjustment when scaling is in effect.
 */
static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
{
	if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio)
		WARN_ON(adjustment < 0);
	adjustment = kvm_scale_tsc((u64) adjustment,
				   vcpu->arch.l1_tsc_scaling_ratio);
	adjust_tsc_offset_guest(vcpu, adjustment);
}

#ifdef CONFIG_X86_64

/*
 * Read the TSC, never returning a value below the cycle_last recorded in
 * pvclock_gtod_data.clock.
 */
static u64 read_tsc(void)
{
	u64 ret = (u64)rdtsc_ordered();
	u64 last = pvclock_gtod_data.clock.cycle_last;

	if (likely(ret >= last))
		return ret;

	/*
	 * GCC likes to generate cmov here, but this branch is extremely
	 * predictable (it's just a function of time and the likely is
	 * very likely) and there's a data dependence, so force GCC
	 * to generate a branch instead.  I don't barrier() because
	 * we don't actually need a barrier, and if this function
	 * ever gets inlined it will generate worse code.
	 */
	asm volatile ("");
	return last;
}

/*
 * Read the cycle counter selected by @clock->vclock_mode.
 *
 * Stores the raw counter value in *@tsc_timestamp and the mode actually used
 * in *@mode, and returns the masked delta since @clock->cycle_last
 * multiplied by @clock->mult.  If no usable counter is available, *@mode is
 * VDSO_CLOCKMODE_NONE, *@tsc_timestamp is 0 and 0 is returned.
 */
static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp,
			  int *mode)
{
	u64 tsc_pg_val;
	long v;

	switch (clock->vclock_mode) {
	case VDSO_CLOCKMODE_HVCLOCK:
		if (hv_read_tsc_page_tsc(hv_get_tsc_page(),
					 tsc_timestamp, &tsc_pg_val)) {
			/* TSC page valid */
			*mode = VDSO_CLOCKMODE_HVCLOCK;
			v = (tsc_pg_val - clock->cycle_last) &
				clock->mask;
		} else {
			/* TSC page invalid */
			*mode = VDSO_CLOCKMODE_NONE;
		}
		break;
	case VDSO_CLOCKMODE_TSC:
		*mode = VDSO_CLOCKMODE_TSC;
		*tsc_timestamp = read_tsc();
		v = (*tsc_timestamp - clock->cycle_last) &
			clock->mask;
		break;
	default:
		*mode = VDSO_CLOCKMODE_NONE;
	}

	/* Every path that left v unassigned set *mode to NONE, so v is set here. */
	if (*mode == VDSO_CLOCKMODE_NONE)
		*tsc_timestamp = v = 0;

	return v * clock->mult;
}

/*
 * As with get_kvmclock_base_ns(), this counts from boot time, at the
 * frequency of CLOCK_MONOTONIC_RAW (hence adding gtod->offs_boot).
 *
 * Stores the nanosecond value in *@t and returns the clock mode reported by
 * vgettsc().  The whole read is retried if the gtod seqcount changed.
 */
static int do_kvmclock_base(s64 *t, u64 *tsc_timestamp)
{
	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
	unsigned long seq;
	int mode;
	u64 ns;

	do {
		seq = read_seqcount_begin(&gtod->seq);
		ns = gtod->raw_clock.base_cycles;
		ns += vgettsc(&gtod->raw_clock, tsc_timestamp, &mode);
		ns >>= gtod->raw_clock.shift;
		ns += ktime_to_ns(ktime_add(gtod->raw_clock.offset, gtod->offs_boot));
	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
	*t = ns;

	return mode;
}

/*
 * This calculates CLOCK_MONOTONIC at the time of the TSC snapshot, with
 * no boot time offset.
*/
static int do_monotonic(s64 *t, u64 *tsc_timestamp)
{
	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
	unsigned long seq;
	int mode;
	u64 ns;

	/* Retry the whole read if the gtod snapshot changed underneath us. */
	do {
		seq = read_seqcount_begin(&gtod->seq);
		ns = gtod->clock.base_cycles;
		ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
		ns >>= gtod->clock.shift;
		ns += ktime_to_ns(gtod->clock.offset);
	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
	*t = ns;

	return mode;
}

/*
 * Fill @ts from the gtod snapshot's wall_time_sec plus the nanoseconds
 * accumulated on the monotonic clock, and store the counter value used in
 * *@tsc_timestamp.  Returns the clock mode reported by vgettsc().
 */
static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
{
	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
	unsigned long seq;
	int mode;
	u64 ns;

	do {
		seq = read_seqcount_begin(&gtod->seq);
		ts->tv_sec = gtod->wall_time_sec;
		ns = gtod->clock.base_cycles;
		ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
		ns >>= gtod->clock.shift;
	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));

	/* Carry whole seconds out of ns; the remainder becomes tv_nsec. */
	ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
	ts->tv_nsec = ns;

	return mode;
}

/*
 * Calculates the kvmclock_base_ns (CLOCK_MONOTONIC_RAW + boot time) and
 * reports the TSC value from which it did so. Returns true if host is
 * using TSC based clocksource.
 */
static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
{
	/* checked again under seqlock below */
	if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
		return false;

	return gtod_is_based_on_tsc(do_kvmclock_base(kernel_ns,
						     tsc_timestamp));
}

/*
 * Calculates CLOCK_MONOTONIC and reports the TSC value from which it did
 * so. Returns true if host is using TSC based clocksource.
 */
bool kvm_get_monotonic_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
{
	/* checked again under seqlock below */
	if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
		return false;

	return gtod_is_based_on_tsc(do_monotonic(kernel_ns,
						 tsc_timestamp));
}

/*
 * Calculates CLOCK_REALTIME and reports the TSC value from which it did
 * so. Returns true if host is using TSC based clocksource.
 *
 * DO NOT USE this for anything related to migration. You want CLOCK_TAI
 * for that.
*/
static bool kvm_get_walltime_and_clockread(struct timespec64 *ts,
					   u64 *tsc_timestamp)
{
	/* checked again under seqlock below */
	if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
		return false;

	/* do_realtime() hands back the vclock mode it sampled for this read. */
	return gtod_is_based_on_tsc(do_realtime(ts, tsc_timestamp));
}
#endif

/*
 *
 * Assuming a stable TSC across physical CPUs, and a stable TSC
 * across virtual CPUs, the following condition is possible.
 * Each numbered line represents an event visible to both
 * CPUs at the next numbered event.
 *
 * "timespecX" represents host monotonic time. "tscX" represents
 * RDTSC value.
 *
 * 		VCPU0 on CPU0		|	VCPU1 on CPU1
 *
 * 1.  read timespec0,tsc0
 * 2.					| timespec1 = timespec0 + N
 * 					| tsc1 = tsc0 + M
 * 3. transition to guest		| transition to guest
 * 4. ret0 = timespec0 + (rdtsc - tsc0) |
 * 5.				        | ret1 = timespec1 + (rdtsc - tsc1)
 * 				        | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
 *
 * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
 *
 * 	- ret0 < ret1
 *	- timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
 *		...
 *	- 0 < N - M => M < N
 *
 * That is, when timespec0 != timespec1, M < N. Unfortunately that is not
 * always the case (the difference between two distinct xtime instances
 * might be smaller than the difference between corresponding TSC reads,
 * when updating guest vcpus pvclock areas).
 *
 * To avoid that problem, do not allow visibility of distinct
 * system_timestamp/tsc_timestamp values simultaneously: use a master
 * copy of host monotonic time values. Update that master copy
 * in lockstep.
 *
 * Rely on synchronization of host TSCs and guest TSCs for monotonicity.
 *
 */

/*
 * Refresh the VM-wide master clock snapshot (ka->master_kernel_ns /
 * ka->master_cycle_now) and recompute ka->use_master_clock.
 *
 * The caller must hold kvm->arch.tsc_write_lock (asserted via lockdep).
 * The whole body is compiled only for CONFIG_X86_64; otherwise this is a
 * no-op.
 */
static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
{
#ifdef CONFIG_X86_64
	struct kvm_arch *ka = &kvm->arch;
	int vclock_mode;
	bool host_tsc_clocksource, vcpus_matched;

	lockdep_assert_held(&kvm->arch.tsc_write_lock);
	/* The "+ 1" compares the matched-vCPU counter against all online vCPUs. */
	vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
			atomic_read(&kvm->online_vcpus));

	/*
	 * If the host uses TSC clock, then passthrough TSC as stable
	 * to the guest.
	 */
	host_tsc_clocksource = kvm_get_time_and_clockread(
					&ka->master_kernel_ns,
					&ka->master_cycle_now);

	/* Every condition must hold for the master clock to be used. */
	ka->use_master_clock = host_tsc_clocksource && vcpus_matched
				&& !ka->backwards_tsc_observed
				&& !ka->boot_vcpu_runs_old_kvmclock;

	/* Only ever set here, never cleared by this function. */
	if (ka->use_master_clock)
		atomic_set(&kvm_guest_has_master_clock, 1);

	vclock_mode = pvclock_gtod_data.clock.vclock_mode;
	trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
					vcpus_matched);
#endif
}

/* Raise KVM_REQ_MCLOCK_INPROGRESS on all vCPUs of @kvm. */
static void kvm_make_mclock_inprogress_request(struct kvm *kvm)
{
	kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
}

/*
 * Open the pvclock write section: take tsc_write_lock with IRQs disabled,
 * then begin the pvclock_sc seqcount write.  Paired with
 * kvm_end_pvclock_update(), which undoes both in reverse order.
 */
static void __kvm_start_pvclock_update(struct kvm *kvm)
{
	raw_spin_lock_irq(&kvm->arch.tsc_write_lock);
	write_seqcount_begin(&kvm->arch.pvclock_sc);
}

/*
 * Same as __kvm_start_pvclock_update(), but first raises
 * KVM_REQ_MCLOCK_INPROGRESS on every vCPU.
 */
static void kvm_start_pvclock_update(struct kvm *kvm)
{
	kvm_make_mclock_inprogress_request(kvm);

	/* no guest entries from this point */
	__kvm_start_pvclock_update(kvm);
}

/*
 * Close the pvclock write section, then ask every vCPU to refresh its clock
 * (KVM_REQ_CLOCK_UPDATE) before clearing KVM_REQ_MCLOCK_INPROGRESS on each.
 */
static void kvm_end_pvclock_update(struct kvm *kvm)
{
	struct kvm_arch *ka = &kvm->arch;
	struct kvm_vcpu *vcpu;
	unsigned long i;

	write_seqcount_end(&ka->pvclock_sc);
	raw_spin_unlock_irq(&ka->tsc_write_lock);
	/* Queue the clock update first, so it is pending once entry is allowed. */
	kvm_for_each_vcpu(i, vcpu, kvm)
		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);

	/* guest entries allowed */
	kvm_for_each_vcpu(i, vcpu, kvm)
		kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
}

/*
 * Full master clock refresh: request a Hyper-V TSC page update, then
 * recompute the master clock copy inside a start/end pvclock update pair.
 */
static void kvm_update_masterclock(struct kvm *kvm)
{
	kvm_hv_request_tsc_page_update(kvm);
	kvm_start_pvclock_update(kvm);
	pvclock_update_vm_gtod_copy(kvm);
	kvm_end_pvclock_update(kvm);
}

/*
 * Use the kernel's tsc_khz directly if the TSC is constant, otherwise use KVM's
 * per-CPU value (which may be zero if a CPU is going offline). Note, tsc_khz
 * can change during boot even if the TSC is constant, as it's possible for KVM
 * to be loaded before TSC calibration completes. Ideally, KVM would get a
 * notification when calibration completes, but practically speaking calibration
 * will complete before userspace is alive enough to create VMs.
*/
static unsigned long get_cpu_tsc_khz(void)
{
	if (static_cpu_has(X86_FEATURE_CONSTANT_TSC))
		return tsc_khz;
	else
		return __this_cpu_read(cpu_tsc_khz);
}

/*
 * Fill @data with a kvmclock reading for @kvm.
 *
 * Called within read_seqcount_begin/retry for kvm->pvclock_sc.
 *
 * When the master clock is in use and this CPU has a usable TSC frequency,
 * data->clock is computed from the master snapshot (master_cycle_now /
 * master_kernel_ns) and a host TSC read, and KVM_CLOCK_TSC_STABLE is set.
 * Otherwise data->clock is get_kvmclock_base_ns() plus the VM's
 * kvmclock_offset and data->flags stays 0.
 */
static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
{
	struct kvm_arch *ka = &kvm->arch;
	struct pvclock_vcpu_time_info hv_clock;

	/* both __this_cpu_read() and rdtsc() should be on the same cpu */
	get_cpu();

	data->flags = 0;
	if (ka->use_master_clock &&
	    (static_cpu_has(X86_FEATURE_CONSTANT_TSC) || __this_cpu_read(cpu_tsc_khz))) {
#ifdef CONFIG_X86_64
		struct timespec64 ts;

		/*
		 * On success, wall time and host_tsc come from the same TSC
		 * read, which is what the two extra flags advertise.
		 */
		if (kvm_get_walltime_and_clockread(&ts, &data->host_tsc)) {
			data->realtime = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec;
			data->flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC;
		} else
#endif
		/*
		 * Note the dangling "else" above: on 64-bit this statement is
		 * its body, on 32-bit it runs unconditionally.
		 */
		data->host_tsc = rdtsc();

		data->flags |= KVM_CLOCK_TSC_STABLE;
		hv_clock.tsc_timestamp = ka->master_cycle_now;
		hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
		kvm_get_time_scale(NSEC_PER_SEC, get_cpu_tsc_khz() * 1000LL,
				   &hv_clock.tsc_shift,
				   &hv_clock.tsc_to_system_mul);
		data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc);
	} else {
		data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset;
	}

	put_cpu();
}

/*
 * Seqcount-protected wrapper: repeat __get_kvmclock() until no pvclock_sc
 * write overlapped the read.
 */
static void get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
{
	struct kvm_arch *ka = &kvm->arch;
	unsigned seq;

	do {
		seq = read_seqcount_begin(&ka->pvclock_sc);
		__get_kvmclock(kvm, data);
	} while (read_seqcount_retry(&ka->pvclock_sc, seq));
}

/* Return only the clock field of a get_kvmclock() reading for @kvm. */
u64 get_kvmclock_ns(struct kvm *kvm)
{
	struct kvm_clock_data data;

	get_kvmclock(kvm, &data);
	return data.clock;
}

/*
 * Copy vcpu->hv_clock into the guest-visible pvclock structure located
 * @offset bytes into the mapping described by @gpc.
 *
 * @force_tsc_unstable: clear PVCLOCK_TSC_STABLE_BIT in the guest copy only;
 *                      vcpu->hv_clock itself keeps the flag.
 *
 * Returns without writing anything if kvm_gpc_refresh() returns non-zero.
 */
static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
				    struct gfn_to_pfn_cache *gpc,
				    unsigned int offset,
				    bool force_tsc_unstable)
{
	struct kvm_vcpu_arch *vcpu = &v->arch;
	struct pvclock_vcpu_time_info *guest_hv_clock;
	unsigned long flags;

	/*
	 * The refresh is done with the lock dropped, so the check has to be
	 * repeated after re-taking it.
	 */
	read_lock_irqsave(&gpc->lock, flags);
	while (!kvm_gpc_check(gpc, offset + sizeof(*guest_hv_clock))) {
		read_unlock_irqrestore(&gpc->lock, flags);

		if (kvm_gpc_refresh(gpc, offset + sizeof(*guest_hv_clock)))
			return;

		read_lock_irqsave(&gpc->lock, flags);
	}

	guest_hv_clock = (void *)(gpc->khva + offset);

	/*
	 * This VCPU is paused, but it's legal for a guest to read another
	 * VCPU's kvmclock, so we really have to follow the specification where
	 * it says that version is odd if data is being modified, and even after
	 * it is consistent.
	 */

	/* "(x + 1) | 1" always yields an odd version, marking the update. */
	guest_hv_clock->version = vcpu->hv_clock.version = (guest_hv_clock->version + 1) | 1;
	smp_wmb();

	/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
	vcpu->hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED);

	if (vcpu->pvclock_set_guest_stopped_request) {
		vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
		vcpu->pvclock_set_guest_stopped_request = false;
	}

	/* This also copies the odd version, so the guest copy stays "busy". */
	memcpy(guest_hv_clock, &vcpu->hv_clock, sizeof(*guest_hv_clock));

	if (force_tsc_unstable)
		guest_hv_clock->flags &= ~PVCLOCK_TSC_STABLE_BIT;

	smp_wmb();

	/* Odd + 1: the version becomes even again, publishing the new data. */
	guest_hv_clock->version = ++vcpu->hv_clock.version;

	kvm_gpc_mark_dirty_in_slot(gpc);
	read_unlock_irqrestore(&gpc->lock, flags);

	trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
}

/*
 * Recompute this vCPU's pvclock parameters (vcpu->hv_clock) and publish
 * them to every active guest-visible copy, plus the Hyper-V TSC page.
 *
 * Returns 0 on success.  Returns 1, after re-queueing KVM_REQ_CLOCK_UPDATE
 * on @v, when get_cpu_tsc_khz() reports 0 for the current CPU.
 */
static int kvm_guest_time_update(struct kvm_vcpu *v)
{
	unsigned long flags, tgt_tsc_khz;
	unsigned seq;
	struct kvm_vcpu_arch *vcpu = &v->arch;
	struct kvm_arch *ka = &v->kvm->arch;
	s64 kernel_ns;
	u64 tsc_timestamp, host_tsc;
	u8 pvclock_flags;
	bool use_master_clock;
#ifdef CONFIG_KVM_XEN
	/*
	 * For Xen guests we may need to override PVCLOCK_TSC_STABLE_BIT as unless
	 * explicitly told to use TSC as its clocksource Xen will not set this bit.
	 * This default behaviour led to bugs in some guest kernels which cause
	 * problems if they observe PVCLOCK_TSC_STABLE_BIT in the pvclock flags.
	 */
	bool xen_pvclock_tsc_unstable =
		ka->xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
#endif

	kernel_ns = 0;
	host_tsc = 0;

	/*
	 * If the host uses TSC clock, then passthrough TSC as stable
	 * to the guest.
	 */
	do {
		seq = read_seqcount_begin(&ka->pvclock_sc);
		use_master_clock = ka->use_master_clock;
		if (use_master_clock) {
			host_tsc = ka->master_cycle_now;
			kernel_ns = ka->master_kernel_ns;
		}
	} while (read_seqcount_retry(&ka->pvclock_sc, seq));

	/* Keep irq disabled to prevent changes to the clock */
	local_irq_save(flags);
	tgt_tsc_khz = get_cpu_tsc_khz();
	if (unlikely(tgt_tsc_khz == 0)) {
		/* No usable frequency right now: re-queue the request and bail. */
		local_irq_restore(flags);
		kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
		return 1;
	}
	/* Without a master snapshot, sample the TSC and the base clock here. */
	if (!use_master_clock) {
		host_tsc = rdtsc();
		kernel_ns = get_kvmclock_base_ns();
	}

	tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);

	/*
	 * We may have to catch up the TSC to match elapsed wall clock
	 * time for two reasons, even if kvmclock is used.
	 *   1) CPU could have been running below the maximum TSC rate
	 *   2) Broken TSC compensation resets the base at each VCPU
	 *      entry to avoid unknown leaps of TSC even when running
	 *      again on the same CPU.  This may cause apparent elapsed
	 *      time to disappear, and the guest to stand still or run
	 *	very slowly.
	 */
	if (vcpu->tsc_catchup) {
		u64 tsc = compute_guest_tsc(v, kernel_ns);
		/* Only ever move the guest TSC forward. */
		if (tsc > tsc_timestamp) {
			adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
			tsc_timestamp = tsc;
		}
	}

	local_irq_restore(flags);

	/* With all the info we got, fill in the values */

	if (kvm_caps.has_tsc_control)
		tgt_tsc_khz = kvm_scale_tsc(tgt_tsc_khz,
					    v->arch.l1_tsc_scaling_ratio);

	/* The scale factors are recomputed only when the frequency changed. */
	if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
		kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
				   &vcpu->hv_clock.tsc_shift,
				   &vcpu->hv_clock.tsc_to_system_mul);
		vcpu->hw_tsc_khz = tgt_tsc_khz;
		kvm_xen_update_tsc_info(v);
	}

	vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
	vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
	vcpu->last_guest_tsc = tsc_timestamp;

	/* If the host uses TSC clocksource, then it is stable */
	pvclock_flags = 0;
	if (use_master_clock)
		pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;

	vcpu->hv_clock.flags = pvclock_flags;

	/* The native pvclock copy never has the stable bit forced off. */
	if (vcpu->pv_time.active)
		kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0, false);
#ifdef CONFIG_KVM_XEN
	if (vcpu->xen.vcpu_info_cache.active)
		kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache,
					offsetof(struct compat_vcpu_info, time),
					xen_pvclock_tsc_unstable);
	if (vcpu->xen.vcpu_time_info_cache.active)
		kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0,
					xen_pvclock_tsc_unstable);
#endif
	kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
	return 0;
}

/*
 * The pvclock_wall_clock ABI tells the guest the wall clock time at
 * which it started (i.e. its epoch, when its kvmclock was zero).
 *
 * In fact those clocks are subtly different; wall clock frequency is
 * adjusted by NTP and has leap seconds, while the kvmclock is a
 * simple function of the TSC without any such adjustment.
 *
 * Perhaps the ABI should have exposed CLOCK_TAI and a ratio between
 * that and kvmclock, but even that would be subject to change over
 * time.
*
 * Attempt to calculate the epoch at a given moment using the *same*
 * TSC reading via kvm_get_walltime_and_clockread() to obtain both
 * wallclock and kvmclock times, and subtracting one from the other.
 *
 * Fall back to using their values at slightly different moments by
 * calling ktime_get_real_ns() and get_kvmclock_ns() separately.
 */
uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm)
{
#ifdef CONFIG_X86_64
	struct pvclock_vcpu_time_info hv_clock;
	struct kvm_arch *ka = &kvm->arch;
	unsigned long seq, local_tsc_khz;
	struct timespec64 ts;
	uint64_t host_tsc;

	do {
		seq = read_seqcount_begin(&ka->pvclock_sc);

		/*
		 * Zero means "use the fallback at the bottom"; it is reset on
		 * every pass so a retry cannot reuse a stale value.
		 */
		local_tsc_khz = 0;
		/* No master clock: leave the loop and take the fallback. */
		if (!ka->use_master_clock)
			break;

		/*
		 * The TSC read and the call to get_cpu_tsc_khz() must happen
		 * on the same CPU.
		 */
		get_cpu();

		local_tsc_khz = get_cpu_tsc_khz();

		if (local_tsc_khz &&
		    !kvm_get_walltime_and_clockread(&ts, &host_tsc))
			local_tsc_khz = 0; /* Fall back to old method */

		put_cpu();

		/*
		 * These values must be snapshotted within the seqcount loop.
		 * After that, it's just mathematics which can happen on any
		 * CPU at any time.
		 */
		hv_clock.tsc_timestamp = ka->master_cycle_now;
		hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
	} while (read_seqcount_retry(&ka->pvclock_sc, seq));

	/*
	 * If the conditions were right, and obtaining the wallclock+TSC was
	 * successful, calculate the KVM clock at the corresponding time and
	 * subtract one from the other to get the guest's epoch in nanoseconds
	 * since 1970-01-01.
	 */
	if (local_tsc_khz) {
		/*
		 * NOTE(review): other callers in this file pass "khz * 1000LL"
		 * for this argument; NSEC_PER_USEC is presumably used here
		 * only as the constant 1000 - confirm.
		 */
		kvm_get_time_scale(NSEC_PER_SEC, local_tsc_khz * NSEC_PER_USEC,
				   &hv_clock.tsc_shift,
				   &hv_clock.tsc_to_system_mul);
		return ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec -
			__pvclock_read_cycles(&hv_clock, host_tsc);
	}
#endif
	/* Fallback: the two clocks are read at slightly different moments. */
	return ktime_get_real_ns() - get_kvmclock_ns(kvm);
}

/*
 * kvmclock updates which are isolated to a given vcpu, such as
 * vcpu->cpu migration, should not allow system_timestamp from
 * the rest of the vcpus to remain static.
Otherwise ntp frequency * correction applies to one vcpu's system_timestamp but not * the others. * * So in those cases, request a kvmclock update for all vcpus. * We need to rate-limit these requests though, as they can * considerably slow guests that have a large number of vcpus. * The time for a remote vcpu to update its kvmclock is bound * by the delay we use to rate-limit the updates. */ #define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100) static void kvmclock_update_fn(struct work_struct *work) { unsigned long i; struct delayed_work *dwork = to_delayed_work(work); struct kvm_arch *ka = container_of(dwork, struct kvm_arch, kvmclock_update_work); struct kvm *kvm = container_of(ka, struct kvm, arch); struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) { kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); kvm_vcpu_kick(vcpu); } } static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) { struct kvm *kvm = v->kvm; kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); schedule_delayed_work(&kvm->arch.kvmclock_update_work, KVMCLOCK_UPDATE_DELAY); } #define KVMCLOCK_SYNC_PERIOD (300 * HZ) static void kvmclock_sync_fn(struct work_struct *work) { struct delayed_work *dwork = to_delayed_work(work); struct kvm_arch *ka = container_of(dwork, struct kvm_arch, kvmclock_sync_work); struct kvm *kvm = container_of(ka, struct kvm, arch); schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0); schedule_delayed_work(&kvm->arch.kvmclock_sync_work, KVMCLOCK_SYNC_PERIOD); } /* These helpers are safe iff @msr is known to be an MCx bank MSR. */ static bool is_mci_control_msr(u32 msr) { return (msr & 3) == 0; } static bool is_mci_status_msr(u32 msr) { return (msr & 3) == 1; } /* * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP. */ static bool can_set_mci_status(struct kvm_vcpu *vcpu) { /* McStatusWrEn enabled? 
*/ if (guest_cpuid_is_amd_or_hygon(vcpu)) return !!(vcpu->arch.msr_hwcr & BIT_ULL(18)); return false; } static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { u64 mcg_cap = vcpu->arch.mcg_cap; unsigned bank_num = mcg_cap & 0xff; u32 msr = msr_info->index; u64 data = msr_info->data; u32 offset, last_msr; switch (msr) { case MSR_IA32_MCG_STATUS: vcpu->arch.mcg_status = data; break; case MSR_IA32_MCG_CTL: if (!(mcg_cap & MCG_CTL_P) && (data || !msr_info->host_initiated)) return 1; if (data != 0 && data != ~(u64)0) return 1; vcpu->arch.mcg_ctl = data; break; case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1; if (msr > last_msr) return 1; if (!(mcg_cap & MCG_CMCI_P) && (data || !msr_info->host_initiated)) return 1; /* An attempt to write a 1 to a reserved bit raises #GP */ if (data & ~(MCI_CTL2_CMCI_EN | MCI_CTL2_CMCI_THRESHOLD_MASK)) return 1; offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2, last_msr + 1 - MSR_IA32_MC0_CTL2); vcpu->arch.mci_ctl2_banks[offset] = data; break; case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: last_msr = MSR_IA32_MCx_CTL(bank_num) - 1; if (msr > last_msr) return 1; /* * Only 0 or all 1s can be written to IA32_MCi_CTL, all other * values are architecturally undefined. But, some Linux * kernels clear bit 10 in bank 4 to workaround a BIOS/GART TLB * issue on AMD K8s, allow bit 10 to be clear when setting all * other bits in order to avoid an uncaught #GP in the guest. * * UNIXWARE clears bit 0 of MC1_CTL to ignore correctable, * single-bit ECC data errors. */ if (is_mci_control_msr(msr) && data != 0 && (data | (1 << 10) | 1) != ~(u64)0) return 1; /* * All CPUs allow writing 0 to MCi_STATUS MSRs to clear the MSR. * AMD-based CPUs allow non-zero values, but if and only if * HWCR[McStatusWrEn] is set. 
*/ if (!msr_info->host_initiated && is_mci_status_msr(msr) && data != 0 && !can_set_mci_status(vcpu)) return 1; offset = array_index_nospec(msr - MSR_IA32_MC0_CTL, last_msr + 1 - MSR_IA32_MC0_CTL); vcpu->arch.mce_banks[offset] = data; break; default: return 1; } return 0; } static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu) { u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT; return (vcpu->arch.apf.msr_en_val & mask) == mask; } static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) { gpa_t gpa = data & ~0x3f; /* Bits 4:5 are reserved, Should be zero */ if (data & 0x30) return 1; if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_VMEXIT) && (data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT)) return 1; if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT) && (data & KVM_ASYNC_PF_DELIVERY_AS_INT)) return 1; if (!lapic_in_kernel(vcpu)) return data ? 1 : 0; vcpu->arch.apf.msr_en_val = data; if (!kvm_pv_async_pf_enabled(vcpu)) { kvm_clear_async_pf_completion_queue(vcpu); kvm_async_pf_hash_reset(vcpu); return 0; } if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa, sizeof(u64))) return 1; vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS); vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT; kvm_async_pf_wakeup_all(vcpu); return 0; } static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data) { /* Bits 8-63 are reserved */ if (data >> 8) return 1; if (!lapic_in_kernel(vcpu)) return 1; vcpu->arch.apf.msr_int_val = data; vcpu->arch.apf.vec = data & KVM_ASYNC_PF_VEC_MASK; return 0; } static void kvmclock_reset(struct kvm_vcpu *vcpu) { kvm_gpc_deactivate(&vcpu->arch.pv_time); vcpu->arch.time = 0; } static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; static_call(kvm_x86_flush_tlb_all)(vcpu); /* Flushing all ASIDs flushes the current ASID... 
*/ kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; if (!tdp_enabled) { /* * A TLB flush on behalf of the guest is equivalent to * INVPCID(all), toggling CR4.PGE, etc., which requires * a forced sync of the shadow page tables. Ensure all the * roots are synced and the guest TLB in hardware is clean. */ kvm_mmu_sync_roots(vcpu); kvm_mmu_sync_prev_roots(vcpu); } static_call(kvm_x86_flush_tlb_guest)(vcpu); /* * Flushing all "guest" TLB is always a superset of Hyper-V's fine * grained flushing. */ kvm_hv_vcpu_purge_flush_tlb(vcpu); } static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; static_call(kvm_x86_flush_tlb_current)(vcpu); } /* * Service "local" TLB flush requests, which are specific to the current MMU * context. In addition to the generic event handling in vcpu_enter_guest(), * TLB flushes that are targeted at an MMU context also need to be serviced * prior before nested VM-Enter/VM-Exit. */ void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu) { if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) kvm_vcpu_flush_tlb_current(vcpu); if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu)) kvm_vcpu_flush_tlb_guest(vcpu); } EXPORT_SYMBOL_GPL(kvm_service_local_tlb_flush_requests); static void record_steal_time(struct kvm_vcpu *vcpu) { struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache; struct kvm_steal_time __user *st; struct kvm_memslots *slots; gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS; u64 steal; u32 version; if (kvm_xen_msr_enabled(vcpu->kvm)) { kvm_xen_runstate_set_running(vcpu); return; } if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; if (WARN_ON_ONCE(current->mm != vcpu->kvm->mm)) return; slots = kvm_memslots(vcpu->kvm); if (unlikely(slots->generation != ghc->generation || gpa != ghc->gpa || kvm_is_error_hva(ghc->hva) || !ghc->memslot)) { /* We rely on the fact that it fits in a single page. 
*/ BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS); if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gpa, sizeof(*st)) || kvm_is_error_hva(ghc->hva) || !ghc->memslot) return; } st = (struct kvm_steal_time __user *)ghc->hva; /* * Doing a TLB flush here, on the guest's behalf, can avoid * expensive IPIs. */ if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) { u8 st_preempted = 0; int err = -EFAULT; if (!user_access_begin(st, sizeof(*st))) return; asm volatile("1: xchgb %0, %2\n" "xor %1, %1\n" "2:\n" _ASM_EXTABLE_UA(1b, 2b) : "+q" (st_preempted), "+&r" (err), "+m" (st->preempted)); if (err) goto out; user_access_end(); vcpu->arch.st.preempted = 0; trace_kvm_pv_tlb_flush(vcpu->vcpu_id, st_preempted & KVM_VCPU_FLUSH_TLB); if (st_preempted & KVM_VCPU_FLUSH_TLB) kvm_vcpu_flush_tlb_guest(vcpu); if (!user_access_begin(st, sizeof(*st))) goto dirty; } else { if (!user_access_begin(st, sizeof(*st))) return; unsafe_put_user(0, &st->preempted, out); vcpu->arch.st.preempted = 0; } unsafe_get_user(version, &st->version, out); if (version & 1) version += 1; /* first time write, random junk */ version += 1; unsafe_put_user(version, &st->version, out); smp_wmb(); unsafe_get_user(steal, &st->steal, out); steal += current->sched_info.run_delay - vcpu->arch.st.last_steal; vcpu->arch.st.last_steal = current->sched_info.run_delay; unsafe_put_user(steal, &st->steal, out); version += 1; unsafe_put_user(version, &st->version, out); out: user_access_end(); dirty: mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa)); } static bool kvm_is_msr_to_save(u32 msr_index) { unsigned int i; for (i = 0; i < num_msrs_to_save; i++) { if (msrs_to_save[i] == msr_index) return true; } return false; } int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { u32 msr = msr_info->index; u64 data = msr_info->data; if (msr && msr == vcpu->kvm->arch.xen_hvm_config.msr) return kvm_xen_write_hypercall_page(vcpu, data); switch (msr) { case MSR_AMD64_NB_CFG: case 
MSR_IA32_UCODE_WRITE: case MSR_VM_HSAVE_PA: case MSR_AMD64_PATCH_LOADER: case MSR_AMD64_BU_CFG2: case MSR_AMD64_DC_CFG: case MSR_AMD64_TW_CFG: case MSR_F15H_EX_CFG: break; case MSR_IA32_UCODE_REV: if (msr_info->host_initiated) vcpu->arch.microcode_version = data; break; case MSR_IA32_ARCH_CAPABILITIES: if (!msr_info->host_initiated) return 1; vcpu->arch.arch_capabilities = data; break; case MSR_IA32_PERF_CAPABILITIES: if (!msr_info->host_initiated) return 1; if (data & ~kvm_caps.supported_perf_cap) return 1; /* * Note, this is not just a performance optimization! KVM * disallows changing feature MSRs after the vCPU has run; PMU * refresh will bug the VM if called after the vCPU has run. */ if (vcpu->arch.perf_capabilities == data) break; vcpu->arch.perf_capabilities = data; kvm_pmu_refresh(vcpu); break; case MSR_IA32_PRED_CMD: { u64 reserved_bits = ~(PRED_CMD_IBPB | PRED_CMD_SBPB); if (!msr_info->host_initiated) { if ((!guest_has_pred_cmd_msr(vcpu))) return 1; if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) && !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB)) reserved_bits |= PRED_CMD_IBPB; if (!guest_cpuid_has(vcpu, X86_FEATURE_SBPB)) reserved_bits |= PRED_CMD_SBPB; } if (!boot_cpu_has(X86_FEATURE_IBPB)) reserved_bits |= PRED_CMD_IBPB; if (!boot_cpu_has(X86_FEATURE_SBPB)) reserved_bits |= PRED_CMD_SBPB; if (data & reserved_bits) return 1; if (!data) break; wrmsrl(MSR_IA32_PRED_CMD, data); break; } case MSR_IA32_FLUSH_CMD: if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D)) return 1; if (!boot_cpu_has(X86_FEATURE_FLUSH_L1D) || (data & ~L1D_FLUSH)) return 1; if (!data) break; wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); break; case MSR_EFER: return set_efer(vcpu, msr_info); case MSR_K7_HWCR: data &= ~(u64)0x40; /* ignore flush filter disable */ data &= ~(u64)0x100; /* ignore ignne emulation enable */ data &= ~(u64)0x8; /* ignore TLB cache disable */ /* * Allow McStatusWrEn and TscFreqSel. 
(Linux guests from v3.2 * through at least v6.6 whine if TscFreqSel is clear, * depending on F/M/S. */ if (data & ~(BIT_ULL(18) | BIT_ULL(24))) { kvm_pr_unimpl_wrmsr(vcpu, msr, data); return 1; } vcpu->arch.msr_hwcr = data; break; case MSR_FAM10H_MMIO_CONF_BASE: if (data != 0) { kvm_pr_unimpl_wrmsr(vcpu, msr, data); return 1; } break; case MSR_IA32_CR_PAT: if (!kvm_pat_valid(data)) return 1; vcpu->arch.pat = data; break; case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000: case MSR_MTRRdefType: return kvm_mtrr_set_msr(vcpu, msr, data); case MSR_IA32_APICBASE: return kvm_set_apic_base(vcpu, msr_info); case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: return kvm_x2apic_msr_write(vcpu, msr, data); case MSR_IA32_TSC_DEADLINE: kvm_set_lapic_tscdeadline_msr(vcpu, data); break; case MSR_IA32_TSC_ADJUST: if (guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) { if (!msr_info->host_initiated) { s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr; adjust_tsc_offset_guest(vcpu, adj); /* Before back to guest, tsc_timestamp must be adjusted * as well, otherwise guest's percpu pvclock time could jump. */ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); } vcpu->arch.ia32_tsc_adjust_msr = data; } break; case MSR_IA32_MISC_ENABLE: { u64 old_val = vcpu->arch.ia32_misc_enable_msr; if (!msr_info->host_initiated) { /* RO bits */ if ((old_val ^ data) & MSR_IA32_MISC_ENABLE_PMU_RO_MASK) return 1; /* R bits, i.e. writes are ignored, but don't fault. 
*/ data = data & ~MSR_IA32_MISC_ENABLE_EMON; data |= old_val & MSR_IA32_MISC_ENABLE_EMON; } if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) && ((old_val ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) { if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3)) return 1; vcpu->arch.ia32_misc_enable_msr = data; kvm_update_cpuid_runtime(vcpu); } else { vcpu->arch.ia32_misc_enable_msr = data; } break; } case MSR_IA32_SMBASE: if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated) return 1; vcpu->arch.smbase = data; break; case MSR_IA32_POWER_CTL: vcpu->arch.msr_ia32_power_ctl = data; break; case MSR_IA32_TSC: if (msr_info->host_initiated) { kvm_synchronize_tsc(vcpu, &data); } else { u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset; adjust_tsc_offset_guest(vcpu, adj); vcpu->arch.ia32_tsc_adjust_msr += adj; } break; case MSR_IA32_XSS: if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) return 1; /* * KVM supports exposing PT to the guest, but does not support * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than * XSAVES/XRSTORS to save/restore PT MSRs. 
*/ if (data & ~kvm_caps.supported_xss) return 1; vcpu->arch.ia32_xss = data; kvm_update_cpuid_runtime(vcpu); break; case MSR_SMI_COUNT: if (!msr_info->host_initiated) return 1; vcpu->arch.smi_count = data; break; case MSR_KVM_WALL_CLOCK_NEW: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) return 1; vcpu->kvm->arch.wall_clock = data; kvm_write_wall_clock(vcpu->kvm, data, 0); break; case MSR_KVM_WALL_CLOCK: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) return 1; vcpu->kvm->arch.wall_clock = data; kvm_write_wall_clock(vcpu->kvm, data, 0); break; case MSR_KVM_SYSTEM_TIME_NEW: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) return 1; kvm_write_system_time(vcpu, data, false, msr_info->host_initiated); break; case MSR_KVM_SYSTEM_TIME: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) return 1; kvm_write_system_time(vcpu, data, true, msr_info->host_initiated); break; case MSR_KVM_ASYNC_PF_EN: if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) return 1; if (kvm_pv_enable_async_pf(vcpu, data)) return 1; break; case MSR_KVM_ASYNC_PF_INT: if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) return 1; if (kvm_pv_enable_async_pf_int(vcpu, data)) return 1; break; case MSR_KVM_ASYNC_PF_ACK: if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) return 1; if (data & 0x1) { vcpu->arch.apf.pageready_pending = false; kvm_check_async_pf_completion(vcpu); } break; case MSR_KVM_STEAL_TIME: if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME)) return 1; if (unlikely(!sched_info_on())) return 1; if (data & KVM_STEAL_RESERVED_MASK) return 1; vcpu->arch.st.msr_val = data; if (!(data & KVM_MSR_ENABLED)) break; kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); break; case MSR_KVM_PV_EOI_EN: if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI)) return 1; if (kvm_lapic_set_pv_eoi(vcpu, data, sizeof(u8))) return 1; break; case MSR_KVM_POLL_CONTROL: if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL)) return 1; /* only enable bit supported */ if (data & (-1ULL << 1)) return 1; vcpu->arch.msr_kvm_poll_control = data; 
break; case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: return set_msr_mce(vcpu, msr_info); case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); if (data) kvm_pr_unimpl_wrmsr(vcpu, msr, data); break; case MSR_K7_CLK_CTL: /* * Ignore all writes to this no longer documented MSR. * Writes are only relevant for old K7 processors, * all pre-dating SVM, but a recommended workaround from * AMD for these chips. It is possible to specify the * affected processor models on the command line, hence * the need to ignore the workaround. */ break; #ifdef CONFIG_KVM_HYPERV case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: case HV_X64_MSR_SYNDBG_OPTIONS: case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: case HV_X64_MSR_CRASH_CTL: case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT: case HV_X64_MSR_REENLIGHTENMENT_CONTROL: case HV_X64_MSR_TSC_EMULATION_CONTROL: case HV_X64_MSR_TSC_EMULATION_STATUS: case HV_X64_MSR_TSC_INVARIANT_CONTROL: return kvm_hv_set_msr_common(vcpu, msr, data, msr_info->host_initiated); #endif case MSR_IA32_BBL_CR_CTL3: /* Drop writes to this legacy MSR -- see rdmsr * counterpart for further detail. 
*/ kvm_pr_unimpl_wrmsr(vcpu, msr, data); break; case MSR_AMD64_OSVW_ID_LENGTH: if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) return 1; vcpu->arch.osvw.length = data; break; case MSR_AMD64_OSVW_STATUS: if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) return 1; vcpu->arch.osvw.status = data; break; case MSR_PLATFORM_INFO: if (!msr_info->host_initiated || (!(data & MSR_PLATFORM_INFO_CPUID_FAULT) && cpuid_fault_enabled(vcpu))) return 1; vcpu->arch.msr_platform_info = data; break; case MSR_MISC_FEATURES_ENABLES: if (data & ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT || (data & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT && !supports_cpuid_fault(vcpu))) return 1; vcpu->arch.msr_misc_features_enables = data; break; #ifdef CONFIG_X86_64 case MSR_IA32_XFD: if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_XFD)) return 1; if (data & ~kvm_guest_supported_xfd(vcpu)) return 1; fpu_update_guest_xfd(&vcpu->arch.guest_fpu, data); break; case MSR_IA32_XFD_ERR: if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_XFD)) return 1; if (data & ~kvm_guest_supported_xfd(vcpu)) return 1; vcpu->arch.guest_fpu.xfd_err = data; break; #endif default: if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); /* * Userspace is allowed to write '0' to MSRs that KVM reports * as to-be-saved, even if an MSRs isn't fully supported. 
*/ if (msr_info->host_initiated && !data && kvm_is_msr_to_save(msr)) break; return KVM_MSR_RET_INVALID; } return 0; } EXPORT_SYMBOL_GPL(kvm_set_msr_common); static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) { u64 data; u64 mcg_cap = vcpu->arch.mcg_cap; unsigned bank_num = mcg_cap & 0xff; u32 offset, last_msr; switch (msr) { case MSR_IA32_P5_MC_ADDR: case MSR_IA32_P5_MC_TYPE: data = 0; break; case MSR_IA32_MCG_CAP: data = vcpu->arch.mcg_cap; break; case MSR_IA32_MCG_CTL: if (!(mcg_cap & MCG_CTL_P) && !host) return 1; data = vcpu->arch.mcg_ctl; break; case MSR_IA32_MCG_STATUS: data = vcpu->arch.mcg_status; break; case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1; if (msr > last_msr) return 1; if (!(mcg_cap & MCG_CMCI_P) && !host) return 1; offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2, last_msr + 1 - MSR_IA32_MC0_CTL2); data = vcpu->arch.mci_ctl2_banks[offset]; break; case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: last_msr = MSR_IA32_MCx_CTL(bank_num) - 1; if (msr > last_msr) return 1; offset = array_index_nospec(msr - MSR_IA32_MC0_CTL, last_msr + 1 - MSR_IA32_MC0_CTL); data = vcpu->arch.mce_banks[offset]; break; default: return 1; } *pdata = data; return 0; } int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { switch (msr_info->index) { case MSR_IA32_PLATFORM_ID: case MSR_IA32_EBL_CR_POWERON: case MSR_IA32_LASTBRANCHFROMIP: case MSR_IA32_LASTBRANCHTOIP: case MSR_IA32_LASTINTFROMIP: case MSR_IA32_LASTINTTOIP: case MSR_AMD64_SYSCFG: case MSR_K8_TSEG_ADDR: case MSR_K8_TSEG_MASK: case MSR_VM_HSAVE_PA: case MSR_K8_INT_PENDING_MSG: case MSR_AMD64_NB_CFG: case MSR_FAM10H_MMIO_CONF_BASE: case MSR_AMD64_BU_CFG2: case MSR_IA32_PERF_CTL: case MSR_AMD64_DC_CFG: case MSR_AMD64_TW_CFG: case MSR_F15H_EX_CFG: /* * Intel Sandy Bridge CPUs must support the RAPL (running average power * limit) MSRs. 
Just return 0, as we do not want to expose the host
	 * data here. Do not conditionalize this on CPUID, as KVM does not do
	 * so for existing CPU-specific MSRs.
	 */
	case MSR_RAPL_POWER_UNIT:
	case MSR_PP0_ENERGY_STATUS:	/* Power plane 0 (core) */
	case MSR_PP1_ENERGY_STATUS:	/* Power plane 1 (graphics uncore) */
	case MSR_PKG_ENERGY_STATUS:	/* Total package */
	case MSR_DRAM_ENERGY_STATUS:	/* DRAM controller */
		/* Shared body: every label stacked above reads as zero. */
		msr_info->data = 0;
		break;
	case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
	case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
	case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
	case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
		/* Let the PMU code answer if it claims the MSR; otherwise read 0. */
		if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
			return kvm_pmu_get_msr(vcpu, msr_info);
		msr_info->data = 0;
		break;
	case MSR_IA32_UCODE_REV:
		msr_info->data = vcpu->arch.microcode_version;
		break;
	case MSR_IA32_ARCH_CAPABILITIES:
		/* Guest reads need the CPUID feature; host-initiated reads do not. */
		if (!msr_info->host_initiated &&
		    !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
			return 1;
		msr_info->data = vcpu->arch.arch_capabilities;
		break;
	case MSR_IA32_PERF_CAPABILITIES:
		if (!msr_info->host_initiated &&
		    !guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
			return 1;
		msr_info->data = vcpu->arch.perf_capabilities;
		break;
	case MSR_IA32_POWER_CTL:
		msr_info->data = vcpu->arch.msr_ia32_power_ctl;
		break;
	case MSR_IA32_TSC: {
		/*
		 * Intel SDM states that MSR_IA32_TSC read adds the TSC offset
		 * even when not intercepted. AMD manual doesn't explicitly
		 * state this but appears to behave the same.
		 *
		 * On userspace reads and writes, however, we unconditionally
		 * return L1's TSC value to ensure backwards-compatible
		 * behavior for migration.
		 */
		u64 offset, ratio;

		if (msr_info->host_initiated) {
			offset = vcpu->arch.l1_tsc_offset;
			ratio = vcpu->arch.l1_tsc_scaling_ratio;
		} else {
			offset = vcpu->arch.tsc_offset;
			ratio = vcpu->arch.tsc_scaling_ratio;
		}
		/* Scale the raw host TSC by the chosen ratio, then add the offset. */
		msr_info->data = kvm_scale_tsc(rdtsc(), ratio) + offset;
		break;
	}
	case MSR_IA32_CR_PAT:
		msr_info->data = vcpu->arch.pat;
		break;
	case MSR_MTRRcap:
	case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000:
	case MSR_MTRRdefType:
		/* MTRR reads are delegated; the helper's result is returned as is. */
		return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
	case 0xcd: /* fsb frequency */
		msr_info->data = 3;
		break;
		/*
		 * MSR_EBC_FREQUENCY_ID
		 * Conservative value valid for even the basic CPU models.
		 * Models 0,1: 000 in bits 23:21 indicating a bus speed of
		 * 100MHz, model 2 000 in bits 18:16 indicating 100MHz,
		 * and 266MHz for model 3, or 4. Set Core Clock
		 * Frequency to System Bus Frequency Ratio to 1 (bits
		 * 31:24) even though these are only valid for CPU
		 * models > 2, however guests may end up dividing or
		 * multiplying by zero otherwise.
		 */
	case MSR_EBC_FREQUENCY_ID:
		msr_info->data = 1 << 24;
		break;
	case MSR_IA32_APICBASE:
		msr_info->data = kvm_get_apic_base(vcpu);
		break;
	case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
		/* x2APIC register reads are delegated; result returned as is. */
		return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
	case MSR_IA32_TSC_DEADLINE:
		msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
		break;
	case MSR_IA32_TSC_ADJUST:
		msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
		break;
	case MSR_IA32_MISC_ENABLE:
		msr_info->data = vcpu->arch.ia32_misc_enable_msr;
		break;
	case MSR_IA32_SMBASE:
		/* Only host-initiated reads, and only with CONFIG_KVM_SMM. */
		if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated)
			return 1;
		msr_info->data = vcpu->arch.smbase;
		break;
	case MSR_SMI_COUNT:
		msr_info->data = vcpu->arch.smi_count;
		break;
	case MSR_IA32_PERF_STATUS:
		/* TSC increment by tick */
		msr_info->data = 1000ULL;
		/* CPU multiplier */
		msr_info->data |= (((uint64_t)4ULL) << 40);
		break;
	case MSR_EFER:
		msr_info->data = vcpu->arch.efer;
		break;
	/*
	 * Each MSR_KVM_* case below is readable only if guest_pv_has() reports
	 * the KVM_FEATURE_* bit named in its check.
	 */
	case MSR_KVM_WALL_CLOCK:
		if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
			return 1;

		msr_info->data = vcpu->kvm->arch.wall_clock;
		break;
	case MSR_KVM_WALL_CLOCK_NEW:
		/* Same backing value as MSR_KVM_WALL_CLOCK, gated on CLOCKSOURCE2. */
		if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
			return 1;

		msr_info->data = vcpu->kvm->arch.wall_clock;
		break;
	case MSR_KVM_SYSTEM_TIME:
		if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
			return 1;

		msr_info->data = vcpu->arch.time;
		break;
	case MSR_KVM_SYSTEM_TIME_NEW:
		/* Same backing value as MSR_KVM_SYSTEM_TIME, gated on CLOCKSOURCE2. */
		if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
			return 1;

		msr_info->data = vcpu->arch.time;
		break;
	case MSR_KVM_ASYNC_PF_EN:
		if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
			return 1;

		msr_info->data = vcpu->arch.apf.msr_en_val;
		break;
	case MSR_KVM_ASYNC_PF_INT:
		if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
			return 1;

		msr_info->data = vcpu->arch.apf.msr_int_val;
		break;
	case MSR_KVM_ASYNC_PF_ACK:
		if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
			return 1;

		/* Always reads as zero. */
		msr_info->data = 0;
		break;
	case MSR_KVM_STEAL_TIME:
		if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
			return 1;

		msr_info->data = vcpu->arch.st.msr_val;
		break;
	case MSR_KVM_PV_EOI_EN:
		if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
			return 1;

		msr_info->data = vcpu->arch.pv_eoi.msr_val;
		break;
	case MSR_KVM_POLL_CONTROL:
		if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
			return 1;

		msr_info->data = vcpu->arch.msr_kvm_poll_control;
		break;
	case MSR_IA32_P5_MC_ADDR:
	case MSR_IA32_P5_MC_TYPE:
	case MSR_IA32_MCG_CAP:
	case MSR_IA32_MCG_CTL:
	case MSR_IA32_MCG_STATUS:
	case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
	case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
		/* All machine-check MSRs go through get_msr_mce(). */
		return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
				   msr_info->host_initiated);
	case MSR_IA32_XSS:
		if (!msr_info->host_initiated &&
		    !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
			return 1;
		msr_info->data = vcpu->arch.ia32_xss;
		break;
	case MSR_K7_CLK_CTL:
		/*
		 * Provide expected ramp-up count for K7. All other
		 * are set to zero, indicating minimum divisors for
		 * every field.
		 *
		 * This prevents guest kernels on AMD host with CPU
		 * type 6, model 8 and higher from exploding due to
		 * the rdmsr failing.
		 */
		msr_info->data = 0x20000000;
		break;
#ifdef CONFIG_KVM_HYPERV
	case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
	case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
	case HV_X64_MSR_SYNDBG_OPTIONS:
	case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
	case HV_X64_MSR_CRASH_CTL:
	case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
	case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
	case HV_X64_MSR_TSC_EMULATION_CONTROL:
	case HV_X64_MSR_TSC_EMULATION_STATUS:
	case HV_X64_MSR_TSC_INVARIANT_CONTROL:
		/* Hyper-V MSRs are delegated; the helper's result is returned as is. */
		return kvm_hv_get_msr_common(vcpu,
					     msr_info->index, &msr_info->data,
					     msr_info->host_initiated);
#endif
	case MSR_IA32_BBL_CR_CTL3:
		/* This legacy MSR exists but isn't fully documented in current
		 * silicon. It is however accessed by winxp in very narrow
		 * scenarios where it sets bit #19, itself documented as
		 * a "reserved" bit. Best effort attempt to source coherent
		 * read data here should the balance of the register be
		 * interpreted by the guest:
		 *
		 * L2 cache control register 3: 64GB range, 256KB size,
		 * enabled, latency 0x1, configured
		 */
		msr_info->data = 0xbe702111;
		break;
	case MSR_AMD64_OSVW_ID_LENGTH:
		/* No host-initiated exception here: guest CPUID must have OSVW. */
		if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
			return 1;
		msr_info->data = vcpu->arch.osvw.length;
		break;
	case MSR_AMD64_OSVW_STATUS:
		if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
			return 1;
		msr_info->data = vcpu->arch.osvw.status;
		break;
	case MSR_PLATFORM_INFO:
		/*
		 * Guest reads depend on the per-VM
		 * guest_can_read_msr_platform_info setting; host-initiated
		 * reads are always allowed.
		 */
		if (!msr_info->host_initiated &&
		    !vcpu->kvm->arch.guest_can_read_msr_platform_info)
			return 1;
		msr_info->data = vcpu->arch.msr_platform_info;
		break;
	case MSR_MISC_FEATURES_ENABLES:
		msr_info->data = vcpu->arch.msr_misc_features_enables;
		break;
	case MSR_K7_HWCR:
		msr_info->data = vcpu->arch.msr_hwcr;
		break;
#ifdef CONFIG_X86_64
	case MSR_IA32_XFD:
		if (!msr_info->host_initiated &&
		    !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
			return 1;

		msr_info->data = vcpu->arch.guest_fpu.fpstate->xfd;
		break;
	case MSR_IA32_XFD_ERR:
		if (!msr_info->host_initiated &&
		    !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
			return 1;

		msr_info->data = vcpu->arch.guest_fpu.xfd_err;
		break;
#endif
	default:
		/* Last chance: MSRs that the PMU code recognises for this vCPU. */
		if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
			return kvm_pmu_get_msr(vcpu, msr_info);

		/*
		 * Userspace is allowed to read MSRs that KVM reports as
		 * to-be-saved, even if an MSR isn't fully supported.
*/
		/* Such host-initiated reads succeed and report zero. */
		if (msr_info->host_initiated &&
		    kvm_is_msr_to_save(msr_info->index)) {
			msr_info->data = 0;
			break;
		}

		return KVM_MSR_RET_INVALID;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_get_msr_common);

/*
 * Read or write a bunch of msrs. All parameters are kernel addresses.
 *
 * Calls @do_msr on each of the msrs->nmsrs entries in order, passing the
 * entry's index and a pointer to its data, and stops at the first entry
 * for which @do_msr returns non-zero.
 *
 * @return number of msrs set successfully, i.e. the index of the first
 * failing entry, or msrs->nmsrs if none failed.
 */
static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
		    struct kvm_msr_entry *entries,
		    int (*do_msr)(struct kvm_vcpu *vcpu,
				  unsigned index, u64 *data))
{
	int i;

	for (i = 0; i < msrs->nmsrs; ++i)
		if (do_msr(vcpu, entries[i].index, &entries[i].data))
			break;

	return i;
}

/*
 * Read or write a bunch of msrs. Parameters are user addresses.
 *
 * The struct kvm_msrs header is copied in from @user_msrs, the entry array
 * is duplicated with memdup_user(), __msr_io() is run on the copy and, when
 * @writeback is non-zero, the whole entry array is copied back to userspace.
 *
 * @return number of msrs set successfully, or a negative errno: -EFAULT if
 * a user copy fails, -E2BIG if nmsrs >= MAX_IO_MSRS, or the error encoded in
 * the pointer returned by memdup_user().
 */
static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
		  int (*do_msr)(struct kvm_vcpu *vcpu,
				unsigned index, u64 *data),
		  int writeback)
{
	struct kvm_msrs msrs;
	struct kvm_msr_entry *entries;
	unsigned size;
	int r;

	r = -EFAULT;
	if (copy_from_user(&msrs, user_msrs, sizeof(msrs)))
		goto out;

	/* Note the limit is exclusive: nmsrs == MAX_IO_MSRS is rejected too. */
	r = -E2BIG;
	if (msrs.nmsrs >= MAX_IO_MSRS)
		goto out;

	size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
	entries = memdup_user(user_msrs->entries, size);
	if (IS_ERR(entries)) {
		r = PTR_ERR(entries);
		goto out;
	}

	r = __msr_io(vcpu, &msrs, entries, do_msr);

	/* A failed writeback replaces the count from __msr_io() with -EFAULT. */
	if (writeback && copy_to_user(user_msrs->entries, entries, size))
		r = -EFAULT;

	kfree(entries);
out:
	return r;
}

/*
 * True when the boot CPU has X86_FEATURE_MWAIT and X86_FEATURE_ARAT and is
 * not flagged with X86_BUG_MONITOR.
 */
static inline bool kvm_can_mwait_in_guest(void)
{
	return boot_cpu_has(X86_FEATURE_MWAIT) &&
		!boot_cpu_has_bug(X86_BUG_MONITOR) &&
		boot_cpu_has(X86_FEATURE_ARAT);
}

#ifdef CONFIG_KVM_HYPERV
/*
 * Copy a struct kvm_cpuid2 header in from @cpuid_arg, hand it to
 * kvm_get_hv_cpuid() together with the user entries pointer, then copy the
 * header back out.
 *
 * Returns 0 on success, -EFAULT if either user copy fails, or the non-zero
 * result of kvm_get_hv_cpuid().
 */
static int kvm_ioctl_get_supported_hv_cpuid(struct kvm_vcpu *vcpu,
					    struct kvm_cpuid2 __user *cpuid_arg)
{
	struct kvm_cpuid2 cpuid;
	int r;

	r = -EFAULT;
	if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
		return r;

	r = kvm_get_hv_cpuid(vcpu, &cpuid, cpuid_arg->entries);
	if (r)
		return r;

	r = -EFAULT;
	if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
		return r;

	return 0;
}
#endif

/*
 * KVM_X86_DEFAULT_VM is always supported; KVM_X86_SW_PROTECTED_VM only when
 * CONFIG_KVM_SW_PROTECTED_VM is enabled and tdp_mmu_enabled is set.
 */
static bool kvm_is_vm_type_supported(unsigned long type)
{
	return type == KVM_X86_DEFAULT_VM ||
	       (type == KVM_X86_SW_PROTECTED_VM &&
		IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled);
}

/*
 * Report what is supported for capability @ext.
 *
 * Returns 0 for capabilities that are not listed or not available, 1 for
 * the plain "supported" capabilities in the first case group, and a
 * capability-specific value (flag mask, limit, size, ...) for the rest.
 * @kvm is only forwarded to the has_emulated_msr hook for KVM_CAP_X86_SMM.
 */
int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
{
	int r = 0;

	switch (ext) {
	case KVM_CAP_IRQCHIP:
	case KVM_CAP_HLT:
	case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
	case KVM_CAP_SET_TSS_ADDR:
	case KVM_CAP_EXT_CPUID:
	case KVM_CAP_EXT_EMUL_CPUID:
	case KVM_CAP_CLOCKSOURCE:
	case KVM_CAP_PIT:
	case KVM_CAP_NOP_IO_DELAY:
	case KVM_CAP_MP_STATE:
	case KVM_CAP_SYNC_MMU:
	case KVM_CAP_USER_NMI:
	case KVM_CAP_REINJECT_CONTROL:
	case KVM_CAP_IRQ_INJECT_STATUS:
	case KVM_CAP_IOEVENTFD:
	case KVM_CAP_IOEVENTFD_NO_LENGTH:
	case KVM_CAP_PIT2:
	case KVM_CAP_PIT_STATE2:
	case KVM_CAP_SET_IDENTITY_MAP_ADDR:
	case KVM_CAP_VCPU_EVENTS:
#ifdef CONFIG_KVM_HYPERV
	case KVM_CAP_HYPERV:
	case KVM_CAP_HYPERV_VAPIC:
	case KVM_CAP_HYPERV_SPIN:
	case KVM_CAP_HYPERV_TIME:
	case KVM_CAP_HYPERV_SYNIC:
	case KVM_CAP_HYPERV_SYNIC2:
	case KVM_CAP_HYPERV_VP_INDEX:
	case KVM_CAP_HYPERV_EVENTFD:
	case KVM_CAP_HYPERV_TLBFLUSH:
	case KVM_CAP_HYPERV_SEND_IPI:
	case KVM_CAP_HYPERV_CPUID:
	case KVM_CAP_HYPERV_ENFORCE_CPUID:
	case KVM_CAP_SYS_HYPERV_CPUID:
#endif
	case KVM_CAP_PCI_SEGMENT:
	case KVM_CAP_DEBUGREGS:
	case KVM_CAP_X86_ROBUST_SINGLESTEP:
	case KVM_CAP_XSAVE:
	case KVM_CAP_ASYNC_PF:
	case KVM_CAP_ASYNC_PF_INT:
	case KVM_CAP_GET_TSC_KHZ:
	case KVM_CAP_KVMCLOCK_CTRL:
	case KVM_CAP_READONLY_MEM:
	case KVM_CAP_IOAPIC_POLARITY_IGNORED:
	case KVM_CAP_TSC_DEADLINE_TIMER:
	case KVM_CAP_DISABLE_QUIRKS:
	case KVM_CAP_SET_BOOT_CPU_ID:
	case KVM_CAP_SPLIT_IRQCHIP:
	case KVM_CAP_IMMEDIATE_EXIT:
	case KVM_CAP_PMU_EVENT_FILTER:
	case KVM_CAP_PMU_EVENT_MASKED_EVENTS:
	case KVM_CAP_GET_MSR_FEATURES:
	case KVM_CAP_MSR_PLATFORM_INFO:
	case KVM_CAP_EXCEPTION_PAYLOAD:
	case KVM_CAP_X86_TRIPLE_FAULT_EVENT:
	case KVM_CAP_SET_GUEST_DEBUG:
	case KVM_CAP_LAST_CPU:
	case KVM_CAP_X86_USER_SPACE_MSR:
	case KVM_CAP_X86_MSR_FILTER:
	case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
#ifdef CONFIG_X86_SGX_KVM
	case KVM_CAP_SGX_ATTRIBUTE:
#endif
	case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
	case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM:
	case KVM_CAP_SREGS2:
	case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
	case KVM_CAP_VCPU_ATTRIBUTES:
	case KVM_CAP_SYS_ATTRIBUTES:
	case KVM_CAP_VAPIC:
	case KVM_CAP_ENABLE_CAP:
	case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES:
	case KVM_CAP_IRQFD_RESAMPLE:
	case KVM_CAP_MEMORY_FAULT_INFO:
		/* Everything stacked above is reported as plainly supported. */
		r = 1;
		break;
	case KVM_CAP_EXIT_HYPERCALL:
		r = KVM_EXIT_HYPERCALL_VALID_MASK;
		break;
	case KVM_CAP_SET_GUEST_DEBUG2:
		/* Returns directly rather than via r; the effect is the same. */
		return KVM_GUESTDBG_VALID_MASK;
#ifdef CONFIG_KVM_XEN
	case KVM_CAP_XEN_HVM:
		r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
		    KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
		    KVM_XEN_HVM_CONFIG_SHARED_INFO |
		    KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL |
		    KVM_XEN_HVM_CONFIG_EVTCHN_SEND |
		    KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE |
		    KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA;
		/* The runstate flags are added only when sched_info_on(). */
		if (sched_info_on())
			r |= KVM_XEN_HVM_CONFIG_RUNSTATE |
			     KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG;
		break;
#endif
	case KVM_CAP_SYNC_REGS:
		r = KVM_SYNC_X86_VALID_FIELDS;
		break;
	case KVM_CAP_ADJUST_CLOCK:
		r = KVM_CLOCK_VALID_FLAGS;
		break;
	case KVM_CAP_X86_DISABLE_EXITS:
		/*
		 * PAUSE is always offered; HLT and CSTATE (plus MWAIT when
		 * kvm_can_mwait_in_guest()) only while mitigate_smt_rsb is
		 * clear.
		 */
		r = KVM_X86_DISABLE_EXITS_PAUSE;

		if (!mitigate_smt_rsb) {
			r |= KVM_X86_DISABLE_EXITS_HLT |
			     KVM_X86_DISABLE_EXITS_CSTATE;

			if (kvm_can_mwait_in_guest())
				r |= KVM_X86_DISABLE_EXITS_MWAIT;
		}
		break;
	case KVM_CAP_X86_SMM:
		/* Without CONFIG_KVM_SMM, r keeps its initial value of 0. */
		if (!IS_ENABLED(CONFIG_KVM_SMM))
			break;

		/* SMBASE is usually relocated above 1M on modern chipsets,
		 * and SMM handlers might indeed rely on 4G segment limits,
		 * so do not report SMM to be available if real mode is
		 * emulated via vm86 mode. Still, do not go to great lengths
		 * to avoid userspace's usage of the feature, because it is a
		 * fringe case that is not enabled except via specific settings
		 * of the module parameters.
		 */
		r = static_call(kvm_x86_has_emulated_msr)(kvm, MSR_IA32_SMBASE);
		break;
	case KVM_CAP_NR_VCPUS:
		/* The smaller of the online CPU count and KVM_MAX_VCPUS. */
		r = min_t(unsigned int, num_online_cpus(), KVM_MAX_VCPUS);
		break;
	case KVM_CAP_MAX_VCPUS:
		r = KVM_MAX_VCPUS;
		break;
	case KVM_CAP_MAX_VCPU_ID:
		r = KVM_MAX_VCPU_IDS;
		break;
	case KVM_CAP_PV_MMU:	/* obsolete */
		r = 0;
		break;
	case KVM_CAP_MCE:
		r = KVM_MAX_MCE_BANKS;
		break;
	case KVM_CAP_XCRS:
		r = boot_cpu_has(X86_FEATURE_XSAVE);
		break;
	case KVM_CAP_TSC_CONTROL:
	case KVM_CAP_VM_TSC_CONTROL:
		r = kvm_caps.has_tsc_control;
		break;
	case KVM_CAP_X2APIC_API:
		r = KVM_X2APIC_API_VALID_FLAGS;
		break;
	case KVM_CAP_NESTED_STATE:
		/*
		 * 0 when the get_state hook is absent; otherwise whatever the
		 * hook returns for NULL arguments.
		 * NOTE(review): presumably the required state size - confirm
		 * against the nested_ops implementations.
		 */
		r = kvm_x86_ops.nested_ops->get_state ?
			kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0;
		break;
#ifdef CONFIG_KVM_HYPERV
	case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
		/* Supported iff the corresponding hook is provided. */
		r = kvm_x86_ops.enable_l2_tlb_flush != NULL;
		break;
	case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
		r = kvm_x86_ops.nested_ops->enable_evmcs != NULL;
		break;
#endif
	case KVM_CAP_SMALLER_MAXPHYADDR:
		r = (int) allow_smaller_maxphyaddr;
		break;
	case KVM_CAP_STEAL_TIME:
		r = sched_info_on();
		break;
	case KVM_CAP_X86_BUS_LOCK_EXIT:
		if (kvm_caps.has_bus_lock_exit)
			r = KVM_BUS_LOCK_DETECTION_OFF |
			    KVM_BUS_LOCK_DETECTION_EXIT;
		else
			r = 0;
		break;
	case KVM_CAP_XSAVE2: {
		/* Never report less than sizeof(struct kvm_xsave). */
		r = xstate_required_size(kvm_get_filtered_xcr0(), false);
		if (r < sizeof(struct kvm_xsave))
			r = sizeof(struct kvm_xsave);
		break;
	}
	case KVM_CAP_PMU_CAPABILITY:
		r = enable_pmu ?
KVM_CAP_PMU_VALID_MASK : 0;
		break;
	case KVM_CAP_DISABLE_QUIRKS2:
		r = KVM_X86_VALID_QUIRKS;
		break;
	case KVM_CAP_X86_NOTIFY_VMEXIT:
		r = kvm_caps.has_notify_vmexit;
		break;
	case KVM_CAP_VM_TYPES:
		/* Bitmask with one BIT() per supported VM type. */
		r = BIT(KVM_X86_DEFAULT_VM);
		if (kvm_is_vm_type_supported(KVM_X86_SW_PROTECTED_VM))
			r |= BIT(KVM_X86_SW_PROTECTED_VM);
		break;
	default:
		/* Unlisted capabilities leave r at its initial value. */
		break;
	}
	return r;
}

/*
 * Convert attr->addr to a user pointer.
 *
 * Returns ERR_PTR_USR(-EFAULT) when the value does not survive the round
 * trip through unsigned long, i.e. it cannot be represented as a pointer
 * here; otherwise returns the pointer.
 */
static inline void __user *kvm_get_attr_addr(struct kvm_device_attr *attr)
{
	void __user *uaddr = (void __user*)(unsigned long)attr->addr;

	if ((u64)(unsigned long)uaddr != attr->addr)
		return ERR_PTR_USR(-EFAULT);
	return uaddr;
}

/*
 * Fetch a device attribute into the user buffer at attr->addr.
 *
 * Only group 0 is accepted, and the only attribute handled is
 * KVM_X86_XCOMP_GUEST_SUPP, which stores kvm_caps.supported_xcr0.
 *
 * Returns 0 on success, -ENXIO for any other group or attribute, -EFAULT if
 * put_user() fails, or the error encoded by kvm_get_attr_addr().
 */
static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr)
{
	u64 __user *uaddr = kvm_get_attr_addr(attr);

	/* The group is checked first, so a bad group wins over a bad address. */
	if (attr->group)
		return -ENXIO;

	if (IS_ERR(uaddr))
		return PTR_ERR(uaddr);

	switch (attr->attr) {
	case KVM_X86_XCOMP_GUEST_SUPP:
		if (put_user(kvm_caps.supported_xcr0, uaddr))
			return -EFAULT;
		return 0;
	default:
		return -ENXIO;
	}
}

/*
 * Tell whether a device attribute is supported.
 *
 * Returns 0 for group 0 / KVM_X86_XCOMP_GUEST_SUPP and -ENXIO for anything
 * else. attr->addr is not looked at.
 */
static int kvm_x86_dev_has_attr(struct kvm_device_attr *attr)
{
	if (attr->group)
		return -ENXIO;

	switch (attr->attr) {
	case KVM_X86_XCOMP_GUEST_SUPP:
		return 0;
	default:
		return -ENXIO;
	}
}

/*
 * Dispatch on @ioctl, treating @arg as a user pointer.
 *
 * Returns the result of the selected case; ioctl numbers with no case
 * yield -EINVAL.
 */
long kvm_arch_dev_ioctl(struct file *filp,
			unsigned int ioctl, unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	long r;

	switch (ioctl) {
	case KVM_GET_MSR_INDEX_LIST: {
		struct kvm_msr_list __user *user_msr_list = argp;
		struct kvm_msr_list msr_list;
		unsigned n;

		r = -EFAULT;
		if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
			goto out;
		/* n is the capacity userspace supplied. */
		n = msr_list.nmsrs;
		msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs;
		/*
		 * The required count is written back before the size check,
		 * so it reaches userspace even when -E2BIG is returned.
		 */
		if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
			goto out;
		r = -E2BIG;
		if (n < msr_list.nmsrs)
			goto out;
		r = -EFAULT;
		/* msrs_to_save first, then emulated_msrs right after them. */
		if (copy_to_user(user_msr_list->indices, &msrs_to_save,
				 num_msrs_to_save * sizeof(u32)))
			goto out;
		if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
				 &emulated_msrs,
				 num_emulated_msrs * sizeof(u32)))
			goto out;
		r = 0;
		break;
	}
	case KVM_GET_SUPPORTED_CPUID:
	case KVM_GET_EMULATED_CPUID: {
		struct kvm_cpuid2 __user *cpuid_arg =
argp; struct kvm_cpuid2 cpuid; r = -EFAULT; if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) goto out; r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries, ioctl); if (r) goto out; r = -EFAULT; if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid))) goto out; r = 0; break; } case KVM_X86_GET_MCE_CAP_SUPPORTED: r = -EFAULT; if (copy_to_user(argp, &kvm_caps.supported_mce_cap, sizeof(kvm_caps.supported_mce_cap))) goto out; r = 0; break; case KVM_GET_MSR_FEATURE_INDEX_LIST: { struct kvm_msr_list __user *user_msr_list = argp; struct kvm_msr_list msr_list; unsigned int n; r = -EFAULT; if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list))) goto out; n = msr_list.nmsrs; msr_list.nmsrs = num_msr_based_features; if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list))) goto out; r = -E2BIG; if (n < msr_list.nmsrs) goto out; r = -EFAULT; if (copy_to_user(user_msr_list->indices, &msr_based_features, num_msr_based_features * sizeof(u32))) goto out; r = 0; break; } case KVM_GET_MSRS: r = msr_io(NULL, argp, do_get_msr_feature, 1); break; #ifdef CONFIG_KVM_HYPERV case KVM_GET_SUPPORTED_HV_CPUID: r = kvm_ioctl_get_supported_hv_cpuid(NULL, argp); break; #endif case KVM_GET_DEVICE_ATTR: { struct kvm_device_attr attr; r = -EFAULT; if (copy_from_user(&attr, (void __user *)arg, sizeof(attr))) break; r = kvm_x86_dev_get_attr(&attr); break; } case KVM_HAS_DEVICE_ATTR: { struct kvm_device_attr attr; r = -EFAULT; if (copy_from_user(&attr, (void __user *)arg, sizeof(attr))) break; r = kvm_x86_dev_has_attr(&attr); break; } default: r = -EINVAL; break; } out: return r; } static void wbinvd_ipi(void *garbage) { wbinvd(); } static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) { return kvm_arch_has_noncoherent_dma(vcpu->kvm); } void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { /* Address WBINVD may be executed by guest */ if (need_emulate_wbinvd(vcpu)) { if (static_call(kvm_x86_has_wb